From 18b5c8ec6da64da90706643ffa52736a85a2ca59 Mon Sep 17 00:00:00 2001 From: Inverle Date: Sun, 22 Jun 2025 00:09:18 +0200 Subject: Handle redirects when scraping feed from HTML (#7654) * Handle redirects when scraping feed from HTML * pass codesniffer * pass PHPStan * Optimize * Another approach relying on HTML base Standard way to save an HTML document with relative references * Fix case of existing HTML base which should not be overridden --------- Co-authored-by: Alexandre Alapetite --- app/Models/Entry.php | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'app/Models/Entry.php') diff --git a/app/Models/Entry.php b/app/Models/Entry.php index bc5ed2279..66c05a830 100644 --- a/app/Models/Entry.php +++ b/app/Models/Entry.php @@ -845,7 +845,7 @@ HTML; * @param string $url Overridden URL. Will default to the entry URL. * @throws Minz_Exception */ - public function getContentByParsing(string $url = '', int $maxRedirs = 3): string { + public function getContentByParsing(string $url = '', int $maxRedirs = 4): string { $url = $url ?: htmlspecialchars_decode($this->link(), ENT_QUOTES); $feed = $this->feed(); if ($url === '' || $feed === null || $feed->pathEntries() === '') { @@ -872,12 +872,16 @@ HTML; } $cachePath = $feed->cacheFilename($url . '#' . $feed->pathEntries()); - $html = httpGet($url, $cachePath, 'html', $feed->attributes(), $feed->curlOptions()); - if (strlen($html) > 0) { + $response = httpGet($url, $cachePath, 'html', $feed->attributes(), $feed->curlOptions()); + $html = $response['body']; + if ($html !== '') { $doc = new DOMDocument(); $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); $xpath = new DOMXPath($doc); + // Account for HTTP redirections + $url = $response['effective_url'] ?: $url; + $maxRedirs -= $response['redirect_count']; if ($maxRedirs > 0) { //Follow any HTML redirection $metas = $xpath->query('//meta[@content]') ?: []; -- cgit v1.2.3