aboutsummaryrefslogtreecommitdiff
path: root/lib/lib_rss.php
diff options
context:
space:
mode:
Diffstat (limited to 'lib/lib_rss.php')
-rw-r--r--lib/lib_rss.php127
1 files changed, 126 insertions, 1 deletions
diff --git a/lib/lib_rss.php b/lib/lib_rss.php
index e020236ea..4e415d857 100644
--- a/lib/lib_rss.php
+++ b/lib/lib_rss.php
@@ -218,6 +218,7 @@ function customSimplePie($attributes = array()): SimplePie {
$simplePie->set_cache_name_function('sha1');
$simplePie->set_cache_location(CACHE_PATH);
$simplePie->set_cache_duration($limits['cache_duration']);
+ $simplePie->enable_order_by_date(false);
$feed_timeout = empty($attributes['timeout']) ? 0 : intval($attributes['timeout']);
$simplePie->set_timeout($feed_timeout > 0 ? $feed_timeout : $limits['timeout']);
@@ -290,7 +291,10 @@ function customSimplePie($attributes = array()): SimplePie {
return $simplePie;
}
-function sanitizeHTML($data, $base = '', $maxLength = false) {
+/**
+ * @param int|false $maxLength
+ */
+function sanitizeHTML($data, string $base = '', $maxLength = false) {
if (!is_string($data) || ($maxLength !== false && $maxLength <= 0)) {
return '';
}
@@ -311,6 +315,127 @@ function sanitizeHTML($data, $base = '', $maxLength = false) {
return $result;
}
+/**
+ * Remove old files from the cache directory.
+ * Only `*.html` and `*.spc` files located directly in CACHE_PATH are considered,
+ * and any file whose path ends with `index.html` is always kept.
+ * @param int $hours files last modified more than this number of hours ago are deleted (720 hours = 30 days)
+ */
+function cleanCache(int $hours = 720) {
+	$files = glob(CACHE_PATH . '/*.{html,spc}', GLOB_BRACE | GLOB_NOSORT);
+	if (!is_array($files)) {
+		// glob() returns false on error: nothing can be iterated
+		return;
+	}
+	// Compute the expiry threshold once, so that all files are compared against the same instant
+	$threshold = time() - (3600 * $hours);
+	foreach ($files as $file) {
+		if (substr($file, -10) === 'index.html') {
+			continue;
+		}
+		$cacheMtime = @filemtime($file);
+		if ($cacheMtime !== false && $cacheMtime < $threshold) {
+			// Best effort: the file may have been removed concurrently since glob() listed it
+			@unlink($file);
+		}
+	}
+}
+
+/**
+ * Set an XML preamble to enforce the HTML content type charset received by HTTP.
+ * The content is returned unchanged when the Content-Type carries no usable charset,
+ * when the normalised charset is windows-1252, or when the content already starts
+ * with a byte order mark or with an XML declaration containing an encoding.
+ * @param string $html the raw downloaded HTML content
+ * @param string $contentType an HTTP Content-Type such as 'text/html; charset=utf-8'
+ * @return string an HTML string with XML encoding information for DOMDocument::loadHTML()
+ */
+function enforceHttpEncoding(string $html, string $contentType = ''): string {
+	// Accept an optionally quoted charset value, located at the end of the header or followed by another parameter
+	$httpCharset = '';
+	if (preg_match('/\bcharset="?([0-9a-z_-]{2,12})"?\s*(?:;|$)/i', $contentType, $matches) === 1) {
+		$httpCharset = $matches[1];
+	}
+	if ($httpCharset === '') {
+		// No charset defined by HTTP, do nothing
+		return $html;
+	}
+	$httpCharsetNormalized = SimplePie_Misc::encoding($httpCharset);
+	if ($httpCharsetNormalized === 'windows-1252') {
+		// Default charset for HTTP, do nothing
+		return $html;
+	}
+	if (substr($html, 0, 3) === "\xEF\xBB\xBF" || // UTF-8 BOM
+		substr($html, 0, 2) === "\xFF\xFE" || // UTF-16 Little Endian BOM (also the first bytes of a UTF-32 Little Endian BOM)
+		substr($html, 0, 2) === "\xFE\xFF" || // UTF-16 Big Endian BOM
+		substr($html, 0, 4) === "\x00\x00\xFE\xFF") { // UTF-32 Big Endian BOM
+		// Existing byte order mark, do nothing
+		return $html;
+	}
+	// Only the first 64 bytes are inspected when looking for an XML declaration
+	if (preg_match('/^<[?]xml[^>]+encoding\b/', substr($html, 0, 64))) {
+		// Existing XML declaration, do nothing
+		return $html;
+	}
+	// The declaration is built from pieces so that the source never contains a literal PHP open/close tag
+	return '<' . '?xml version="1.0" encoding="' . $httpCharsetNormalized . '" ?' . ">\n" . $html;
+}
+
+/**
+ * Download the document at $url with cURL, going through a file cache.
+ *
+ * A non-empty cache file younger than the configured `cache_duration` is returned directly.
+ * Otherwise the URL is fetched, the body is passed through enforceHttpEncoding() with the
+ * received Content-Type, and the result is written to the cache file before being returned.
+ * A failed transfer or a non-200 status is only logged as a warning: the body received, if any,
+ * is still cached and returned.
+ *
+ * @param string $url address of the document to download
+ * @param array<string,mixed> $attributes options read here: `timeout` (seconds, overrides the configured timeout when > 0),
+ *  `curl_params` (array of additional cURL options), `ssl_verify` (enables or disables the TLS host and peer checks)
+ * @return string the cached or downloaded content, or an empty string when nothing could be retrieved
+ */
+function getHtml(string $url, array $attributes = []): string {
+	$limits = FreshRSS_Context::$system_conf->limits;
+	$feed_timeout = empty($attributes['timeout']) ? 0 : intval($attributes['timeout']);
+	// Never write credentials embedded in the URL to the logs
+	$safeUrl = SimplePie_Misc::url_remove_credentials($url);
+
+	$cachePath = FreshRSS_Feed::cacheFilename($url, $attributes, FreshRSS_Feed::KIND_HTML_XPATH);
+	$cacheMtime = @filemtime($cachePath);
+	if ($cacheMtime !== false && $cacheMtime > time() - intval($limits['cache_duration'])) {
+		$html = @file_get_contents($cachePath);
+		if ($html != '') {
+			syslog(LOG_DEBUG, 'FreshRSS uses cache for ' . $safeUrl);
+			return $html;
+		}
+	}
+
+	if (mt_rand(0, 30) === 1) {	// Remove old entries once in a while
+		cleanCache();
+	}
+
+	if (FreshRSS_Context::$system_conf->simplepie_syslog_enabled) {
+		syslog(LOG_INFO, 'FreshRSS GET ' . $safeUrl);
+	}
+
+	$timeout = $feed_timeout > 0 ? $feed_timeout : $limits['timeout'];
+
+	// TODO: Implement HTTP 1.1 conditional GET If-Modified-Since
+	$ch = curl_init();
+	curl_setopt_array($ch, [
+		CURLOPT_URL => $url,
+		CURLOPT_REFERER => $safeUrl,
+		CURLOPT_HTTPHEADER => array('Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
+		CURLOPT_USERAGENT => FRESHRSS_USERAGENT,
+		CURLOPT_CONNECTTIMEOUT => $timeout,
+		CURLOPT_TIMEOUT => $timeout,
+		//CURLOPT_FAILONERROR => true;
+		CURLOPT_MAXREDIRS => 4,
+		CURLOPT_RETURNTRANSFER => true,
+		CURLOPT_FOLLOWLOCATION => true,
+		CURLOPT_ENCODING => '',	//Enable all encodings
+	]);
+
+	// Later calls override earlier ones: system-wide options first, then the per-call ones
+	curl_setopt_array($ch, FreshRSS_Context::$system_conf->curl_options);
+
+	if (isset($attributes['curl_params']) && is_array($attributes['curl_params'])) {
+		curl_setopt_array($ch, $attributes['curl_params']);
+	}
+
+	if (isset($attributes['ssl_verify'])) {
+		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $attributes['ssl_verify'] ? 2 : 0);
+		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $attributes['ssl_verify'] ? true : false);
+		if (!$attributes['ssl_verify']) {
+			curl_setopt($ch, CURLOPT_SSL_CIPHER_LIST, 'DEFAULT@SECLEVEL=1');
+		}
+	}
+	$html = curl_exec($ch);
+	$c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+	$c_content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
+	$c_error = curl_error($ch);
+	curl_close($ch);
+
+	if (!is_string($c_content_type)) {
+		// cURL reports null when the server did not send a valid Content-Type header
+		$c_content_type = '';
+	}
+
+	if ($c_status != 200 || $c_error != '' || $html === false) {
+		Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $safeUrl);
+	}
+	if ($html == false) {
+		$html = '';
+	} else {
+		$html = enforceHttpEncoding($html, $c_content_type);
+	}
+
+	if (file_put_contents($cachePath, $html) === false) {
+		Minz_Log::warning("Error saving cache $cachePath for $safeUrl");
+	}
+
+	return $html;
+}
+
/**
* Validate an email address, supports internationalized addresses.
*