aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
author: Alexandre Alapetite <alexandre@alapetite.fr> 2025-08-01 08:30:49 +0200
committer: GitHub <noreply@github.com> 2025-08-01 08:30:49 +0200
commit e915ebe46ecc76bd00e19a9cc63764ff2e277315 (patch)
tree 4cb7caf3c8ad5328dab45097c3f92acabc621f5b /lib
parent 188cc0d063b15be22cdd230b808c0bce5cff68e7 (diff)
Rework fetch favicons (#7767)
* Use main function `httpGet()` instead of local one
* Use HTTP cache, also between users
* Do not default to feed URL when there is no website URL

TODO for later: consider supporting Atom's `<icon>` and RSS 2.0's `<image>`
https://github.com/FreshRSS/FreshRSS/issues/7774
Diffstat (limited to 'lib')
-rw-r--r-- lib/favicons.php 89
-rw-r--r-- lib/lib_rss.php 23
2 files changed, 26 insertions, 86 deletions
diff --git a/lib/favicons.php b/lib/favicons.php
index e0baf542b..c28cdfc7f 100644
--- a/lib/favicons.php
+++ b/lib/favicons.php
@@ -22,81 +22,14 @@ function isImgMime(string $content): bool {
return $isImage;
}
-/** @param array<int,int|bool|string> $curlOptions */
-function downloadHttp(string &$url, array $curlOptions = []): string {
- if (($retryAfter = FreshRSS_http_Util::getRetryAfter($url)) > 0) {
- Minz_Log::warning('For that domain, will first retry favicon after ' . date('c', $retryAfter) . '. ' . \SimplePie\Misc::url_remove_credentials($url));
- return '';
- }
-
- syslog(LOG_INFO, 'FreshRSS Favicon GET ' . $url);
- $url2 = checkUrl($url);
- if ($url2 == false) {
- return '';
- }
- $url = $url2;
-
- $ch = curl_init($url);
- if ($ch === false) {
- return '';
- }
- curl_setopt_array($ch, [
- CURLOPT_HEADER => true,
- CURLOPT_RETURNTRANSFER => true,
- CURLOPT_TIMEOUT => 15,
- CURLOPT_USERAGENT => FRESHRSS_USERAGENT,
- CURLOPT_MAXREDIRS => 10,
- CURLOPT_FOLLOWLOCATION => true,
- CURLOPT_ENCODING => '', //Enable all encodings
- //CURLOPT_VERBOSE => 1, // To debug sent HTTP headers
- ]);
-
- FreshRSS_Context::initSystem();
- if (FreshRSS_Context::hasSystemConf()) {
- curl_setopt_array($ch, FreshRSS_Context::systemConf()->curl_options);
- }
-
- curl_setopt_array($ch, $curlOptions);
-
- $response = curl_exec($ch);
- $c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
- $c_effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
- curl_close($ch);
-
- $parser = new \SimplePie\HTTP\Parser(is_string($response) ? $response : '');
- if ($parser->parse()) {
- $headers = $parser->headers;
- $body = $parser->body;
- } else {
- $headers = [];
- $body = false;
- }
-
- if (in_array($c_status, [429, 503], true)) {
- $retryAfter = FreshRSS_http_Util::setRetryAfter($url, $headers['retry-after'] ?? '');
- if ($c_status === 429) {
- $errorMessage = 'HTTP 429 Too Many Requests! Searching favicon [' . \SimplePie\Misc::url_remove_credentials($url) . ']';
- } elseif ($c_status === 503) {
- $errorMessage = 'HTTP 503 Service Unavailable! Searching favicon [' . \SimplePie\Misc::url_remove_credentials($url) . ']';
- }
- if ($retryAfter > 0) {
- $errorMessage .= ' We may retry after ' . date('c', $retryAfter);
- }
- }
-
- $url2 = checkUrl($c_effective_url);
- if ($url2 != false) {
- $url = $url2; //Possible redirect
- }
-
- return $c_status === 200 && is_string($body) ? $body : '';
+function faviconCachePath(string $url): string {
+ return CACHE_PATH . '/' . sha1($url) . '.ico';
}
-function searchFavicon(string &$url): string {
+function searchFavicon(string $url): string {
$dom = new DOMDocument();
- $html = downloadHttp($url);
-
- if ($html == '' || !@$dom->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
+ ['body' => $html, 'effective_url' => $effective_url, 'fail' => $fail] = httpGet($url, cachePath: CACHE_PATH . '/' . sha1($url) . '.html', type: 'html');
+ if ($fail || $html === '' || !@$dom->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
return '';
}
@@ -110,14 +43,14 @@ function searchFavicon(string &$url): string {
// Use the base element for relative paths, if there is one
$baseElements = $xpath->query('//base[@href]');
$baseElement = ($baseElements !== false && $baseElements->length > 0) ? $baseElements->item(0) : null;
- $baseUrl = ($baseElement instanceof DOMElement) ? $baseElement->getAttribute('href') : $url;
+ $baseUrl = ($baseElement instanceof DOMElement) ? $baseElement->getAttribute('href') : $effective_url;
foreach ($links as $link) {
if (!$link instanceof DOMElement) {
continue;
}
$href = trim($link->getAttribute('href'));
- $urlParts = parse_url($url);
+ $urlParts = parse_url($effective_url);
// Handle protocol-relative URLs by adding the current URL's scheme
if (substr($href, 0, 2) === '//') {
@@ -133,7 +66,9 @@ function searchFavicon(string &$url): string {
if ($iri == false) {
return '';
}
- $favicon = downloadHttp($iri, [CURLOPT_REFERER => $url]);
+ $favicon = httpGet($iri, faviconCachePath($iri), 'ico', curl_options: [
+ CURLOPT_REFERER => $effective_url,
+ ])['body'];
if (isImgMime($favicon)) {
return $favicon;
}
@@ -152,7 +87,9 @@ function download_favicon(string $url, string $dest): bool {
}
if ($favicon == '') {
$link = $rootUrl . 'favicon.ico';
- $favicon = downloadHttp($link, [CURLOPT_REFERER => $url]);
+ $favicon = httpGet($link, faviconCachePath($link), 'ico', curl_options: [
+ CURLOPT_REFERER => $url,
+ ])['body'];
if (!isImgMime($favicon)) {
$favicon = '';
}
diff --git a/lib/lib_rss.php b/lib/lib_rss.php
index 25405017a..8954f9921 100644
--- a/lib/lib_rss.php
+++ b/lib/lib_rss.php
@@ -432,13 +432,9 @@ function sanitizeHTML(string $data, string $base = '', ?int $maxLength = null):
function cleanCache(int $hours = 720): void {
// N.B.: GLOB_BRACE is not available on all platforms
- $files = array_merge(
- glob(CACHE_PATH . '/*.html', GLOB_NOSORT) ?: [],
- glob(CACHE_PATH . '/*.json', GLOB_NOSORT) ?: [],
- glob(CACHE_PATH . '/*.spc', GLOB_NOSORT) ?: [],
- glob(CACHE_PATH . '/*.xml', GLOB_NOSORT) ?: []);
+ $files = glob(CACHE_PATH . '/*.*', GLOB_NOSORT) ?: [];
foreach ($files as $file) {
- if (substr($file, -10) === 'index.html') {
+ if (str_ends_with($file, 'index.html')) {
continue;
}
$cacheMtime = @filemtime($file);
@@ -543,7 +539,7 @@ function enforceHtmlBase(string $html, string $href): string {
}
/**
- * @param string $type {html,json,opml,xml}
+ * @param string $type {html,ico,json,opml,xml}
* @param array<string,mixed> $attributes
* @param array<int,mixed> $curl_options
* @return array{body:string,effective_url:string,redirect_count:int,fail:bool}
@@ -574,7 +570,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
syslog(LOG_INFO, 'FreshRSS GET ' . $type . ' ' . \SimplePie\Misc::url_remove_credentials($url));
}
- $accept = '*/*;q=0.8';
+ $accept = '';
switch ($type) {
case 'json':
$accept = 'application/json,application/feed+json,application/javascript;q=0.9,text/javascript;q=0.8,*/*;q=0.7';
@@ -585,6 +581,9 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
case 'xml':
$accept = 'application/xml,application/xhtml+xml,text/xml;q=0.9,*/*;q=0.8';
break;
+ case 'ico':
+ $accept = 'image/x-icon,image/vnd.microsoft.icon,image/ico,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.1';
+ break;
case 'html':
default:
$accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
@@ -673,9 +672,13 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
} elseif (!is_string($body) || strlen($body) === 0) {
$body = '';
} else {
- $body = trim($body, " \n\r\t\v"); // Do not trim \x00 to avoid breaking a BOM
- if ($type !== 'json') {
+ if (in_array($type, ['html', 'json', 'opml', 'xml'], true)) {
+ $body = trim($body, " \n\r\t\v"); // Do not trim \x00 to avoid breaking a BOM
+ }
+ if (in_array($type, ['html', 'xml', 'opml'], true)) {
$body = enforceHttpEncoding($body, $c_content_type);
+ }
+ if (in_array($type, ['html'], true)) {
$body = enforceHtmlBase($body, $c_effective_url);
}
}