From 18b5c8ec6da64da90706643ffa52736a85a2ca59 Mon Sep 17 00:00:00 2001
From: Inverle <inverle@proton.me>
Date: Sun, 22 Jun 2025 00:09:18 +0200
Subject: Handle redirects when scraping feed from HTML (#7654)

* Handle redirects when scraping feed from HTML

* pass codesniffer

* pass PHPStan

* Optimize

* Another approach relying on HTML base
Standard way to save an HTML document with relative references

* Fix case of existing HTML base
which should not be overridden

---------

Co-authored-by: Alexandre Alapetite <alexandre@alapetite.fr>
---
 app/Models/Entry.php | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'app/Models/Entry.php')

diff --git a/app/Models/Entry.php b/app/Models/Entry.php
index bc5ed2279..66c05a830 100644
--- a/app/Models/Entry.php
+++ b/app/Models/Entry.php
@@ -845,7 +845,7 @@ HTML;
 	 * @param string $url Overridden URL. Will default to the entry URL.
 	 * @throws Minz_Exception
 	 */
-	public function getContentByParsing(string $url = '', int $maxRedirs = 3): string {
+	public function getContentByParsing(string $url = '', int $maxRedirs = 4): string {
 		$url = $url ?: htmlspecialchars_decode($this->link(), ENT_QUOTES);
 		$feed = $this->feed();
 		if ($url === '' || $feed === null || $feed->pathEntries() === '') {
@@ -872,12 +872,16 @@ HTML;
 		}
 
 		$cachePath = $feed->cacheFilename($url . '#' . $feed->pathEntries());
-		$html = httpGet($url, $cachePath, 'html', $feed->attributes(), $feed->curlOptions());
-		if (strlen($html) > 0) {
+		$response = httpGet($url, $cachePath, 'html', $feed->attributes(), $feed->curlOptions());
+		$html = $response['body'];
+		if ($html !== '') {
 			$doc = new DOMDocument();
 			$doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
 			$xpath = new DOMXPath($doc);
 
+			// Account for HTTP redirections
+			$url = $response['effective_url'] ?: $url;
+			$maxRedirs -= $response['redirect_count'];
 			if ($maxRedirs > 0) {
 				//Follow any HTML redirection
 				$metas = $xpath->query('//meta[@content]') ?: [];
-- 
cgit v1.2.3