diff options
Diffstat (limited to 'lib/lib_rss.php')
| -rw-r--r-- | lib/lib_rss.php | 148 |
1 files changed, 147 insertions, 1 deletions
diff --git a/lib/lib_rss.php b/lib/lib_rss.php
index e020236ea..4e415d857 100644
--- a/lib/lib_rss.php
+++ b/lib/lib_rss.php
@@ -218,6 +218,7 @@ function customSimplePie($attributes = array()): SimplePie {
 	$simplePie->set_cache_name_function('sha1');
 	$simplePie->set_cache_location(CACHE_PATH);
 	$simplePie->set_cache_duration($limits['cache_duration']);
+	$simplePie->enable_order_by_date(false);
 
 	$feed_timeout = empty($attributes['timeout']) ? 0 : intval($attributes['timeout']);
 	$simplePie->set_timeout($feed_timeout > 0 ? $feed_timeout : $limits['timeout']);
@@ -290,7 +291,10 @@ function customSimplePie($attributes = array()): SimplePie {
 	return $simplePie;
 }
 
-function sanitizeHTML($data, $base = '', $maxLength = false) {
+/**
+ * @param int|false $maxLength
+ */
+function sanitizeHTML($data, string $base = '', $maxLength = false) {
 	if (!is_string($data) || ($maxLength !== false && $maxLength <= 0)) {
 		return '';
 	}
@@ -311,6 +315,148 @@ function sanitizeHTML($data, $base = '', $maxLength = false) {
 	return $result;
 }
 
+/**
+ * Delete the cached `*.html` and `*.spc` files of CACHE_PATH that have not been modified recently.
+ * Files named `index.html` are never deleted.
+ * @param int $hours files last modified more than this number of hours ago are deleted
+ */
+function cleanCache(int $hours = 720) {
+	// GLOB_BRACE is not available on all platforms (e.g. Alpine Linux, Solaris) and glob() returns false on error
+	$files = array_merge(
+		glob(CACHE_PATH . '/*.html', GLOB_NOSORT) ?: [],
+		glob(CACHE_PATH . '/*.spc', GLOB_NOSORT) ?: []
+	);
+	$expiry = time() - (3600 * $hours);
+	foreach ($files as $file) {
+		if (basename($file) === 'index.html') {
+			continue;
+		}
+		$cacheMtime = @filemtime($file);
+		if ($cacheMtime !== false && $cacheMtime < $expiry) {
+			unlink($file);
+		}
+	}
+}
+
+/**
+ * Set an XML preamble to enforce the HTML content type charset received by HTTP.
+ * The content is returned unchanged when HTTP declares no charset or one that normalizes to windows-1252,
+ * or when the content already starts with a byte order mark or an XML declaration with an encoding.
+ * @param string $html the raw downloaded HTML content
+ * @param string $contentType an HTTP Content-Type such as 'text/html; charset=utf-8'
+ * @return string an HTML string with XML encoding information for DOMDocument::loadHTML()
+ */
+function enforceHttpEncoding(string $html, string $contentType = ''): string {
+	// The charset value may be quoted and may be followed by further parameters
+	$httpCharset = preg_match('/\bcharset="?([0-9a-z_-]{2,12})"?\s*(?:;|$)/i', $contentType, $matches) === 1 ? $matches[1] : '';
+	if ($httpCharset === '') {
+		// No charset defined by HTTP, do nothing
+		return $html;
+	}
+	$httpCharsetNormalized = SimplePie_Misc::encoding($httpCharset);
+	if ($httpCharsetNormalized === 'windows-1252') {
+		// Default charset for HTTP, do nothing
+		return $html;
+	}
+	if (substr($html, 0, 3) === "\xEF\xBB\xBF" ||	// UTF-8 BOM
+		substr($html, 0, 2) === "\xFF\xFE" ||	// UTF-16 Little Endian BOM
+		substr($html, 0, 2) === "\xFE\xFF" ||	// UTF-16 Big Endian BOM
+		substr($html, 0, 4) === "\xFF\xFE\x00\x00" ||	// UTF-32 Little Endian BOM
+		substr($html, 0, 4) === "\x00\x00\xFE\xFF") {	// UTF-32 Big Endian BOM
+		// Existing byte order mark, do nothing
+		return $html;
+	}
+	if (preg_match('/^<[?]xml[^>]+encoding\b/', substr($html, 0, 64))) {
+		// Existing XML declaration, do nothing
+		return $html;
+	}
+	return '<' . '?xml version="1.0" encoding="' . $httpCharsetNormalized . '" ?' . ">\n" . $html;
+}
+
+/**
+ * Download the content at the given URL with cURL, using a file-based cache.
+ * A non-empty cached copy newer than the configured `cache_duration` limit is returned without any network request.
+ * Otherwise the downloaded content is passed through enforceHttpEncoding() and written to the cache.
+ * @param string $url the address to fetch
+ * @param array<string,mixed> $attributes options; the keys read here are `timeout`, `curl_params` and `ssl_verify`
+ * @return string the (possibly cached) content, or an empty string when cURL returned no content
+ */
+function getHtml(string $url, array $attributes = []): string {
+	$limits = FreshRSS_Context::$system_conf->limits;
+	$feed_timeout = empty($attributes['timeout']) ? 0 : intval($attributes['timeout']);
+	// Never write credentials embedded in the URL to the logs
+	$safeUrl = SimplePie_Misc::url_remove_credentials($url);
+
+	$cachePath = FreshRSS_Feed::cacheFilename($url, $attributes, FreshRSS_Feed::KIND_HTML_XPATH);
+	$cacheMtime = @filemtime($cachePath);
+	if ($cacheMtime !== false && $cacheMtime > time() - intval($limits['cache_duration'])) {
+		$html = @file_get_contents($cachePath);
+		if ($html != '') {
+			syslog(LOG_DEBUG, 'FreshRSS uses cache for ' . $safeUrl);
+			return $html;
+		}
+	}
+
+	if (mt_rand(0, 30) === 1) {	// Remove old entries once in a while
+		cleanCache();
+	}
+
+	if (FreshRSS_Context::$system_conf->simplepie_syslog_enabled) {
+		syslog(LOG_INFO, 'FreshRSS GET ' . $safeUrl);
+	}
+
+	// TODO: Implement HTTP 1.1 conditional GET If-Modified-Since
+	$ch = curl_init();
+	curl_setopt_array($ch, [
+		CURLOPT_URL => $url,
+		CURLOPT_REFERER => $safeUrl,
+		CURLOPT_HTTPHEADER => ['Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'],
+		CURLOPT_USERAGENT => FRESHRSS_USERAGENT,
+		CURLOPT_CONNECTTIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
+		CURLOPT_TIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
+		//CURLOPT_FAILONERROR => true;
+		CURLOPT_MAXREDIRS => 4,
+		CURLOPT_RETURNTRANSFER => true,
+		CURLOPT_FOLLOWLOCATION => true,
+		CURLOPT_ENCODING => '',	//Enable all encodings
+	]);
+
+	curl_setopt_array($ch, FreshRSS_Context::$system_conf->curl_options);
+
+	if (isset($attributes['curl_params']) && is_array($attributes['curl_params'])) {
+		curl_setopt_array($ch, $attributes['curl_params']);
+	}
+
+	if (isset($attributes['ssl_verify'])) {
+		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $attributes['ssl_verify'] ? 2 : 0);
+		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $attributes['ssl_verify'] ? true : false);
+		if (!$attributes['ssl_verify']) {
+			curl_setopt($ch, CURLOPT_SSL_CIPHER_LIST, 'DEFAULT@SECLEVEL=1');
+		}
+	}
+	$html = curl_exec($ch);
+	$c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+	$c_content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
+	$c_error = curl_error($ch);
+	curl_close($ch);
+
+	if ($c_status != 200 || $c_error != '' || $html === false) {
+		Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $safeUrl);
+	}
+	if (!is_string($html) || $html === '') {
+		$html = '';
+	} else {
+		// curl_getinfo() does not return a string when the server sent no valid Content-Type header
+		$html = enforceHttpEncoding($html, is_string($c_content_type) ? $c_content_type : '');
+	}
+
+	if (file_put_contents($cachePath, $html) === false) {
+		Minz_Log::warning("Error saving cache $cachePath for $safeUrl");
+	}
+
+	return $html;
+}
+
 /**
  * Validate an email address, supports internationalized addresses.
  *
|
