From 57f46922e856e80c2974dd8e35e60318577fccdc Mon Sep 17 00:00:00 2001 From: Dan Hersam <142353+jaden@users.noreply.github.com> Date: Fri, 10 Nov 2023 03:14:34 -0500 Subject: Rework the behavior of favicon search (#5839) * Rework the behavior of favicon search Use xpath queries to reduce the filtering necessary. Don't call checkUrl since the href value will not always be a full URL. Check for a base path in the HTML to be used if the favicon href doesn't start with a leading slash. * Break long xpath query into two lines * Fix issues found by CI --- lib/favicons.php | 63 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 28 deletions(-) (limited to 'lib/favicons.php') diff --git a/lib/favicons.php b/lib/favicons.php index 8da38c519..a74f3331f 100644 --- a/lib/favicons.php +++ b/lib/favicons.php @@ -68,34 +68,41 @@ function downloadHttp(string &$url, array $curlOptions = []): string { function searchFavicon(string &$url): string { $dom = new DOMDocument(); $html = downloadHttp($url); - if ($html != '' && @$dom->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) { - $rels = array('shortcut icon', 'icon'); - $links = $dom->getElementsByTagName('link'); - foreach ($rels as $rel) { - foreach ($links as $link) { - if ($link->hasAttribute('rel') && $link->hasAttribute('href') && - strtolower(trim($link->getAttribute('rel'))) === $rel) { - $href = trim($link->getAttribute('href')); - if (substr($href, 0, 2) === '//') { - // Case of protocol-relative URLs - if (preg_match('%^(https?:)//%i', $url, $matches) === 1) { - $href = $matches[1] . $href; - } else { - $href = 'https:' . $href; - } - } - $checkUrl = checkUrl($href, false); - if (is_string($checkUrl)) { - $href = SimplePie_IRI::absolutize($url, $href); - } - $favicon = downloadHttp($href, array( - CURLOPT_REFERER => $url, - )); - if (isImgMime($favicon)) { - return $favicon; - } - } - } + + if ($html == '' || !@$dom->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) { + return ''; + } + + $xpath = new DOMXPath($dom); + $links = $xpath->query('//link[@href][translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")="shortcut icon"' + . ' or translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")="icon"]'); + + if (!$links) { + return ''; + } + + // Use the base element for relative paths, if there is one + $baseElements = $xpath->query('//base[@href]'); + $baseElement = ($baseElements !== false && $baseElements->length > 0) ? $baseElements->item(0) : null; + $baseUrl = ($baseElement instanceof DOMElement) ? $baseElement->getAttribute('href') : $url; + + foreach ($links as $link) { + if (!$link instanceof DOMElement) { + continue; + } + $href = trim($link->getAttribute('href')); + $urlParts = parse_url($url); + + // Handle protocol-relative URLs by adding the current URL's scheme + if (substr($href, 0, 2) === '//') { + $href = ($urlParts['scheme'] ?? 'https') . '://' . $href; + } + + $href = SimplePie_IRI::absolutize($baseUrl, $href); + + $favicon = downloadHttp($href, array(CURLOPT_REFERER => $url)); + if (isImgMime($favicon)) { + return $favicon; } } return ''; -- cgit v1.2.3