diff options
| author | 2025-06-22 00:09:18 +0200 | |
|---|---|---|
| committer | 2025-06-22 00:09:18 +0200 | |
| commit | 18b5c8ec6da64da90706643ffa52736a85a2ca59 (patch) | |
| tree | 0b7ac0082c0a0aa86c0f7a4caccb616011576023 /app/Models | |
| parent | a6948218fb1c66fe146c7651555e5a1f791c8112 (diff) | |
Handle redirects when scraping feed from HTML (#7654)
* Handle redirects when scraping feed from HTML
* pass codesniffer
* pass PHPStan
* Optimize
* Another approach relying on HTML base
Standard way to save an HTML document with relative references
* Fix case of existing HTML base
which should not be overridden
---------
Co-authored-by: Alexandre Alapetite <alexandre@alapetite.fr>
Diffstat (limited to 'app/Models')
| -rw-r--r-- | app/Models/Category.php | 2 | ||||
| -rw-r--r-- | app/Models/Entry.php | 10 | ||||
| -rw-r--r-- | app/Models/Feed.php | 4 |
3 files changed, 10 insertions, 6 deletions
diff --git a/app/Models/Category.php b/app/Models/Category.php index e883a99cf..554e002fb 100644 --- a/app/Models/Category.php +++ b/app/Models/Category.php @@ -188,7 +188,7 @@ class FreshRSS_Category extends Minz_Model { } $ok = true; $cachePath = $this->cacheFilename($url); - $opml = httpGet($url, $cachePath, 'opml', $this->attributes(), $this->curlOptions()); + $opml = httpGet($url, $cachePath, 'opml', $this->attributes(), $this->curlOptions())['body']; if ($opml == '') { Minz_Log::warning('Error getting dynamic OPML for category ' . $this->id() . '! ' . \SimplePie\Misc::url_remove_credentials($url)); diff --git a/app/Models/Entry.php b/app/Models/Entry.php index bc5ed2279..66c05a830 100644 --- a/app/Models/Entry.php +++ b/app/Models/Entry.php @@ -845,7 +845,7 @@ HTML; * @param string $url Overridden URL. Will default to the entry URL. * @throws Minz_Exception */ - public function getContentByParsing(string $url = '', int $maxRedirs = 3): string { + public function getContentByParsing(string $url = '', int $maxRedirs = 4): string { $url = $url ?: htmlspecialchars_decode($this->link(), ENT_QUOTES); $feed = $this->feed(); if ($url === '' || $feed === null || $feed->pathEntries() === '') { @@ -872,12 +872,16 @@ HTML; } $cachePath = $feed->cacheFilename($url . '#' . 
$feed->pathEntries()); - $html = httpGet($url, $cachePath, 'html', $feed->attributes(), $feed->curlOptions()); - if (strlen($html) > 0) { + $response = httpGet($url, $cachePath, 'html', $feed->attributes(), $feed->curlOptions()); + $html = $response['body']; + if ($html !== '') { $doc = new DOMDocument(); $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); $xpath = new DOMXPath($doc); + // Account for HTTP redirections + $url = $response['effective_url'] ?: $url; + $maxRedirs -= $response['redirect_count']; if ($maxRedirs > 0) { //Follow any HTML redirection $metas = $xpath->query('//meta[@content]') ?: []; diff --git a/app/Models/Feed.php b/app/Models/Feed.php index fc17c875f..3c5fed507 100644 --- a/app/Models/Feed.php +++ b/app/Models/Feed.php @@ -788,7 +788,7 @@ class FreshRSS_Feed extends Minz_Model { } $httpAccept = $this->kind() === FreshRSS_Feed::KIND_HTML_XPATH_JSON_DOTNOTATION ? 'html' : 'json'; - $content = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions()); + $content = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions())['body']; if (strlen($content) <= 0) { return null; } @@ -846,7 +846,7 @@ class FreshRSS_Feed extends Minz_Model { } $httpAccept = $this->kind() === FreshRSS_Feed::KIND_XML_XPATH ? 'xml' : 'html'; - $html = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions()); + $html = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions())['body']; if (strlen($html) <= 0) { return null; } |
