From 05ae1b0d2684cea4eda664c5ea1a995cb9f0c4b9 Mon Sep 17 00:00:00 2001 From: Alexandre Alapetite Date: Thu, 9 Feb 2023 13:57:20 +0100 Subject: XML+XPath (#5076) * XML+XPath #fix https://github.com/FreshRSS/FreshRSS/issues/5075 Implementation allowing to take an XML document as input using an XML parser (instead of an HTML parser for HTML+XPath) * Remove noise from another PR * Better MIME for XML * And add glob *.xml for cache cleaning * Minor syntax * Add glob json for clean cache --- app/Models/Feed.php | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'app/Models/Feed.php') diff --git a/app/Models/Feed.php b/app/Models/Feed.php index f7ff76768..7c46199a5 100644 --- a/app/Models/Feed.php +++ b/app/Models/Feed.php @@ -17,6 +17,11 @@ class FreshRSS_Feed extends Minz_Model { * @var int */ const KIND_HTML_XPATH = 10; + /** + * Normal XML with XPath scraping + * @var int + */ + const KIND_XML_XPATH = 15; /** * Normal JSON with XPath scraping * @var int @@ -586,7 +591,7 @@ class FreshRSS_Feed extends Minz_Model { /** * @return SimplePie|null */ - public function loadHtmlXpath(bool $loadDetails = false, bool $noCache = false) { + public function loadHtmlXpath() { if ($this->url == '') { return null; } @@ -614,8 +619,9 @@ class FreshRSS_Feed extends Minz_Model { return null; } - $cachePath = FreshRSS_Feed::cacheFilename($feedSourceUrl, $this->attributes(), FreshRSS_Feed::KIND_HTML_XPATH); - $html = httpGet($feedSourceUrl, $cachePath, 'html', $this->attributes()); + $cachePath = FreshRSS_Feed::cacheFilename($feedSourceUrl, $this->attributes(), $this->kind()); + $html = httpGet($feedSourceUrl, $cachePath, + $this->kind() === FreshRSS_Feed::KIND_XML_XPATH ? 'xml' : 'html', $this->attributes()); if (strlen($html) <= 0) { return null; } @@ -630,7 +636,18 @@ class FreshRSS_Feed extends Minz_Model { $doc = new DOMDocument(); $doc->recover = true; $doc->strictErrorChecking = false; - $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); + + switch ($this->kind()) { + case FreshRSS_Feed::KIND_HTML_XPATH: + $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); + break; + case FreshRSS_Feed::KIND_XML_XPATH: + $doc->loadXML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); + break; + default: + return null; + } + $xpath = new DOMXPath($doc); $view->rss_title = $xPathFeedTitle == '' ? $this->name() : htmlspecialchars(@$xpath->evaluate('normalize-space(' . $xPathFeedTitle . ')'), ENT_COMPAT, 'UTF-8'); @@ -776,8 +793,10 @@ class FreshRSS_Feed extends Minz_Model { public static function cacheFilename(string $url, array $attributes, int $kind = FreshRSS_Feed::KIND_RSS): string { $simplePie = customSimplePie($attributes); $filename = $simplePie->get_cache_filename($url); - if ($kind == FreshRSS_Feed::KIND_HTML_XPATH) { + if ($kind === FreshRSS_Feed::KIND_HTML_XPATH) { return CACHE_PATH . '/' . $filename . '.html'; + } elseif ($kind === FreshRSS_Feed::KIND_XML_XPATH) { + return CACHE_PATH . '/' . $filename . '.xml'; } else { return CACHE_PATH . '/' . $filename . '.spc'; } -- cgit v1.2.3