diff options
| -rw-r--r-- | app/Controllers/extensionController.php | 2 | ||||
| -rw-r--r-- | app/Models/Category.php | 2 | ||||
| -rw-r--r-- | app/Models/Entry.php | 10 | ||||
| -rw-r--r-- | app/Models/Feed.php | 4 | ||||
| -rw-r--r-- | lib/lib_rss.php | 49 |
5 files changed, 55 insertions, 12 deletions
diff --git a/app/Controllers/extensionController.php b/app/Controllers/extensionController.php index 5afa43bd8..ef63c38d2 100644 --- a/app/Controllers/extensionController.php +++ b/app/Controllers/extensionController.php @@ -48,7 +48,7 @@ class FreshRSS_extension_Controller extends FreshRSS_ActionController { $cacheFile = CACHE_PATH . '/extension_list.json'; if (FreshRSS_Context::userConf()->retrieve_extension_list === true) { if (!file_exists($cacheFile) || (time() - (filemtime($cacheFile) ?: 0) > 86400)) { - $json = httpGet($extensionListUrl, $cacheFile, 'json'); + $json = httpGet($extensionListUrl, $cacheFile, 'json')['body']; } else { $json = @file_get_contents($cacheFile) ?: ''; } diff --git a/app/Models/Category.php b/app/Models/Category.php index e883a99cf..554e002fb 100644 --- a/app/Models/Category.php +++ b/app/Models/Category.php @@ -188,7 +188,7 @@ class FreshRSS_Category extends Minz_Model { } $ok = true; $cachePath = $this->cacheFilename($url); - $opml = httpGet($url, $cachePath, 'opml', $this->attributes(), $this->curlOptions()); + $opml = httpGet($url, $cachePath, 'opml', $this->attributes(), $this->curlOptions())['body']; if ($opml == '') { Minz_Log::warning('Error getting dynamic OPML for category ' . $this->id() . '! ' . \SimplePie\Misc::url_remove_credentials($url)); diff --git a/app/Models/Entry.php b/app/Models/Entry.php index bc5ed2279..66c05a830 100644 --- a/app/Models/Entry.php +++ b/app/Models/Entry.php @@ -845,7 +845,7 @@ HTML; * @param string $url Overridden URL. Will default to the entry URL. * @throws Minz_Exception */ - public function getContentByParsing(string $url = '', int $maxRedirs = 3): string { + public function getContentByParsing(string $url = '', int $maxRedirs = 4): string { $url = $url ?: htmlspecialchars_decode($this->link(), ENT_QUOTES); $feed = $this->feed(); if ($url === '' || $feed === null || $feed->pathEntries() === '') { @@ -872,12 +872,16 @@ HTML; } $cachePath = $feed->cacheFilename($url . '#' . $feed->pathEntries()); - $html = httpGet($url, $cachePath, 'html', $feed->attributes(), $feed->curlOptions()); - if (strlen($html) > 0) { + $response = httpGet($url, $cachePath, 'html', $feed->attributes(), $feed->curlOptions()); + $html = $response['body']; + if ($html !== '') { $doc = new DOMDocument(); $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); $xpath = new DOMXPath($doc); + // Account for HTTP redirections + $url = $response['effective_url'] ?: $url; + $maxRedirs -= $response['redirect_count']; if ($maxRedirs > 0) { //Follow any HTML redirection $metas = $xpath->query('//meta[@content]') ?: []; diff --git a/app/Models/Feed.php b/app/Models/Feed.php index fc17c875f..3c5fed507 100644 --- a/app/Models/Feed.php +++ b/app/Models/Feed.php @@ -788,7 +788,7 @@ class FreshRSS_Feed extends Minz_Model { } $httpAccept = $this->kind() === FreshRSS_Feed::KIND_HTML_XPATH_JSON_DOTNOTATION ? 'html' : 'json'; - $content = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions()); + $content = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions())['body']; if (strlen($content) <= 0) { return null; } @@ -846,7 +846,7 @@ class FreshRSS_Feed extends Minz_Model { } $httpAccept = $this->kind() === FreshRSS_Feed::KIND_XML_XPATH ? 'xml' : 'html'; - $html = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions()); + $html = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions())['body']; if (strlen($html) <= 0) { return null; } diff --git a/lib/lib_rss.php b/lib/lib_rss.php index b8c6bc3cd..9d315a26d 100644 --- a/lib/lib_rss.php +++ b/lib/lib_rss.php @@ -508,11 +508,46 @@ function enforceHttpEncoding(string $html, string $contentType = ''): string { } /** + * Set an HTML base URL to the HTML content if there is none. + * @param string $html the raw downloaded HTML content + * @param string $href the HTML base URL + * @return string an HTML string + */ +function enforceHtmlBase(string $html, string $href): string { + $doc = new DOMDocument(); + $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); + if ($doc->documentElement === null) { + return ''; + } + $xpath = new DOMXPath($doc); + $bases = $xpath->evaluate('//base'); + if (!($bases instanceof DOMNodeList) || $bases->length === 0) { + $base = $doc->createElement('base'); + if ($base === false) { + return $html; + } + $base->setAttribute('href', $href); + $head = null; + $heads = $xpath->evaluate('//head'); + if ($heads instanceof DOMNodeList && $heads->length > 0) { + $head = $heads->item(0); + } + if ($head instanceof DOMElement) { + $head->insertBefore($base, $head->firstChild); + } else { + $doc->insertBefore($base, $doc->documentElement->firstChild); + } + } + return $doc->saveHTML() ?: $html; +} + +/** * @param string $type {html,json,opml,xml} * @param array<string,mixed> $attributes * @param array<int,mixed> $curl_options + * @return array{body:string,effective_url:string,redirect_count:int,fail:bool} */ -function httpGet(string $url, string $cachePath, string $type = 'html', array $attributes = [], array $curl_options = []): string { +function httpGet(string $url, string $cachePath, string $type = 'html', array $attributes = [], array $curl_options = []): array { $limits = FreshRSS_Context::systemConf()->limits; $feed_timeout = empty($attributes['timeout']) || !is_numeric($attributes['timeout']) ? 0 : intval($attributes['timeout']); @@ -521,7 +556,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a $body = @file_get_contents($cachePath); if ($body != false) { syslog(LOG_DEBUG, 'FreshRSS uses cache for ' . \SimplePie\Misc::url_remove_credentials($url)); - return $body; + return ['body' => $body, 'effective_url' => $url, 'redirect_count' => 0, 'fail' => false]; } } @@ -553,7 +588,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a // TODO: Implement HTTP 1.1 conditional GET If-Modified-Since $ch = curl_init(); if ($ch === false) { - return ''; + return ['body' => '', 'effective_url' => '', 'redirect_count' => 0, 'fail' => true]; } curl_setopt_array($ch, [ CURLOPT_URL => $url, @@ -598,10 +633,13 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a $body = curl_exec($ch); $c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE); $c_content_type = '' . curl_getinfo($ch, CURLINFO_CONTENT_TYPE); + $c_effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); + $c_redirect_count = curl_getinfo($ch, CURLINFO_REDIRECT_COUNT); $c_error = curl_error($ch); curl_close($ch); - if ($c_status != 200 || $c_error != '' || $body === false) { + $fail = $c_status != 200 || $c_error != '' || $body === false; + if ($fail) { Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $url); $body = ''; // TODO: Implement HTTP 410 Gone @@ -611,6 +649,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a $body = trim($body, " \n\r\t\v"); // Do not trim \x00 to avoid breaking a BOM if ($type !== 'json') { $body = enforceHttpEncoding($body, $c_content_type); + $body = enforceHtmlBase($body, $c_effective_url); } } @@ -618,7 +657,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a Minz_Log::warning("Error saving cache $cachePath for $url"); } - return $body; + return ['body' => $body, 'effective_url' => $c_effective_url, 'redirect_count' => $c_redirect_count, 'fail' => $fail]; } /** |
