From 05ae1b0d2684cea4eda664c5ea1a995cb9f0c4b9 Mon Sep 17 00:00:00 2001 From: Alexandre Alapetite Date: Thu, 9 Feb 2023 13:57:20 +0100 Subject: XML+XPath (#5076) * XML+XPath #fix https://github.com/FreshRSS/FreshRSS/issues/5075 Implementation allowing to take an XML document as input using an XML parser (instead of an HTML parser for HTML+XPath) * Remove noise from another PR * Better MIME for XML * And add glob *.xml for cache cleaning * Minor syntax * Add glob json for clean cache --- app/Controllers/feedController.php | 14 ++++++++++---- app/Controllers/subscriptionController.php | 2 +- app/Models/Feed.php | 29 ++++++++++++++++++++++++----- app/Services/ExportService.php | 1 + app/Services/ImportService.php | 5 ++++- app/i18n/cz/sub.php | 1 + app/i18n/de/sub.php | 1 + app/i18n/el/sub.php | 1 + app/i18n/en-us/sub.php | 1 + app/i18n/en/sub.php | 1 + app/i18n/es/sub.php | 1 + app/i18n/fr/sub.php | 1 + app/i18n/he/sub.php | 1 + app/i18n/id/sub.php | 1 + app/i18n/it/sub.php | 1 + app/i18n/ja/sub.php | 1 + app/i18n/ko/sub.php | 1 + app/i18n/nl/sub.php | 1 + app/i18n/oc/sub.php | 1 + app/i18n/pl/sub.php | 1 + app/i18n/pt-br/sub.php | 1 + app/i18n/ru/sub.php | 1 + app/i18n/sk/sub.php | 1 + app/i18n/tr/sub.php | 1 + app/i18n/zh-cn/sub.php | 1 + app/i18n/zh-tw/sub.php | 1 + app/views/helpers/export/opml.phtml | 11 +++++++++-- app/views/helpers/feed/update.phtml | 5 +++-- app/views/subscription/add.phtml | 1 + docs/en/developers/OPML.md | 4 +++- lib/lib_rss.php | 14 ++++++++++++-- p/scripts/feed.js | 11 +++++++++-- 32 files changed, 98 insertions(+), 20 deletions(-) diff --git a/app/Controllers/feedController.php b/app/Controllers/feedController.php index 2bef85f0e..84f38fe5e 100644 --- a/app/Controllers/feedController.php +++ b/app/Controllers/feedController.php @@ -81,6 +81,7 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { $feed->load(true); //Throws FreshRSS_Feed_Exception, Minz_FileNotExistException break; case FreshRSS_Feed::KIND_HTML_XPATH: + case FreshRSS_Feed::KIND_XML_XPATH: $feed->_website($url); break; } @@ -201,8 +202,8 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { $timeout = intval(Minz_Request::param('timeout', 0)); $attributes['timeout'] = $timeout > 0 ? $timeout : null; - $feed_kind = Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS); - if ($feed_kind == FreshRSS_Feed::KIND_HTML_XPATH) { + $feed_kind = (int)Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS); + if ($feed_kind === FreshRSS_Feed::KIND_HTML_XPATH || $feed_kind === FreshRSS_Feed::KIND_XML_XPATH) { $xPathSettings = []; if (Minz_Request::param('xPathFeedTitle', '') != '') $xPathSettings['feedTitle'] = Minz_Request::param('xPathFeedTitle', '', true); if (Minz_Request::param('xPathItem', '') != '') $xPathSettings['item'] = Minz_Request::param('xPathItem', '', true); @@ -385,10 +386,15 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { if ($simplePiePush) { $simplePie = $simplePiePush; //Used by WebSub } elseif ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH) { - $simplePie = $feed->loadHtmlXpath(false, $isNewFeed); - if ($simplePie == null) { + $simplePie = $feed->loadHtmlXpath(); + if ($simplePie === null) { throw new FreshRSS_Feed_Exception('HTML+XPath Web scraping failed for [' . $feed->url(false) . ']'); } + } elseif ($feed->kind() === FreshRSS_Feed::KIND_XML_XPATH) { + $simplePie = $feed->loadHtmlXpath(); + if ($simplePie === null) { + throw new FreshRSS_Feed_Exception('XML+XPath parsing failed for [' . $feed->url(false) . ']'); + } } else { $simplePie = $feed->load(false, $isNewFeed); } diff --git a/app/Controllers/subscriptionController.php b/app/Controllers/subscriptionController.php index b2ee046d9..f0355a82a 100644 --- a/app/Controllers/subscriptionController.php +++ b/app/Controllers/subscriptionController.php @@ -203,7 +203,7 @@ class FreshRSS_subscription_Controller extends FreshRSS_ActionController { $feed->_filtersAction('read', preg_split('/[\n\r]+/', Minz_Request::param('filteractions_read', ''))); $feed->_kind(intval(Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS))); - if ($feed->kind() == FreshRSS_Feed::KIND_HTML_XPATH) { + if ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH || $feed->kind() === FreshRSS_Feed::KIND_XML_XPATH) { $xPathSettings = []; if (Minz_Request::param('xPathItem', '') != '') $xPathSettings['item'] = Minz_Request::param('xPathItem', '', true); if (Minz_Request::param('xPathItemTitle', '') != '') $xPathSettings['itemTitle'] = Minz_Request::param('xPathItemTitle', '', true); diff --git a/app/Models/Feed.php b/app/Models/Feed.php index f7ff76768..7c46199a5 100644 --- a/app/Models/Feed.php +++ b/app/Models/Feed.php @@ -17,6 +17,11 @@ class FreshRSS_Feed extends Minz_Model { * @var int */ const KIND_HTML_XPATH = 10; + /** + * Normal XML with XPath scraping + * @var int + */ + const KIND_XML_XPATH = 15; /** * Normal JSON with XPath scraping * @var int @@ -586,7 +591,7 @@ class FreshRSS_Feed extends Minz_Model { /** * @return SimplePie|null */ - public function loadHtmlXpath(bool $loadDetails = false, bool $noCache = false) { + public function loadHtmlXpath() { if ($this->url == '') { return null; } @@ -614,8 +619,9 @@ class FreshRSS_Feed extends Minz_Model { return null; } - $cachePath = FreshRSS_Feed::cacheFilename($feedSourceUrl, $this->attributes(), FreshRSS_Feed::KIND_HTML_XPATH); - $html = httpGet($feedSourceUrl, $cachePath, 'html', $this->attributes()); + $cachePath = FreshRSS_Feed::cacheFilename($feedSourceUrl, $this->attributes(), $this->kind()); + $html = httpGet($feedSourceUrl, $cachePath, + $this->kind() === FreshRSS_Feed::KIND_XML_XPATH ? 'xml' : 'html', $this->attributes()); if (strlen($html) <= 0) { return null; } @@ -630,7 +636,18 @@ class FreshRSS_Feed extends Minz_Model { $doc = new DOMDocument(); $doc->recover = true; $doc->strictErrorChecking = false; - $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); + + switch ($this->kind()) { + case FreshRSS_Feed::KIND_HTML_XPATH: + $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); + break; + case FreshRSS_Feed::KIND_XML_XPATH: + $doc->loadXML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); + break; + default: + return null; + } + $xpath = new DOMXPath($doc); $view->rss_title = $xPathFeedTitle == '' ? $this->name() : htmlspecialchars(@$xpath->evaluate('normalize-space(' . $xPathFeedTitle . ')'), ENT_COMPAT, 'UTF-8'); @@ -776,8 +793,10 @@ class FreshRSS_Feed extends Minz_Model { public static function cacheFilename(string $url, array $attributes, int $kind = FreshRSS_Feed::KIND_RSS): string { $simplePie = customSimplePie($attributes); $filename = $simplePie->get_cache_filename($url); - if ($kind == FreshRSS_Feed::KIND_HTML_XPATH) { + if ($kind === FreshRSS_Feed::KIND_HTML_XPATH) { return CACHE_PATH . '/' . $filename . '.html'; + } elseif ($kind === FreshRSS_Feed::KIND_XML_XPATH) { + return CACHE_PATH . '/' . $filename . '.xml'; } else { return CACHE_PATH . '/' . $filename . '.spc'; } diff --git a/app/Services/ExportService.php b/app/Services/ExportService.php index 2f35666a8..6b0a3f178 100644 --- a/app/Services/ExportService.php +++ b/app/Services/ExportService.php @@ -21,6 +21,7 @@ class FreshRSS_Export_Service { const FRSS_NAMESPACE = 'https://freshrss.org/opml'; const TYPE_HTML_XPATH = 'HTML+XPath'; + const TYPE_XML_XPATH = 'XML+XPath'; const TYPE_RSS_ATOM = 'rss'; /** diff --git a/app/Services/ImportService.php b/app/Services/ImportService.php index 68aa6f741..55aa28679 100644 --- a/app/Services/ImportService.php +++ b/app/Services/ImportService.php @@ -160,10 +160,13 @@ class FreshRSS_Import_Service { $feed->_website($website); $feed->_description($description); - switch ($feed_elt['type'] ?? '') { + switch (strtolower($feed_elt['type'] ?? '')) { case strtolower(FreshRSS_Export_Service::TYPE_HTML_XPATH): $feed->_kind(FreshRSS_Feed::KIND_HTML_XPATH); break; + case strtolower(FreshRSS_Export_Service::TYPE_XML_XPATH): + $feed->_kind(FreshRSS_Feed::KIND_XML_XPATH); + break; case strtolower(FreshRSS_Export_Service::TYPE_RSS_ATOM): default: $feed->_kind(FreshRSS_Feed::KIND_RSS); diff --git a/app/i18n/cz/sub.php b/app/i18n/cz/sub.php index a11a9359d..3d08c315b 100644 --- a/app/i18n/cz/sub.php +++ b/app/i18n/cz/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath pro:', ), 'rss' => 'RSS / Atom (výchozí)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Vymazat mezipaměť', diff --git a/app/i18n/de/sub.php b/app/i18n/de/sub.php index 580f7d348..b265c1b98 100644 --- a/app/i18n/de/sub.php +++ b/app/i18n/de/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath für:', ), 'rss' => 'RSS / Atom (Standard)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Zwischenspeicher leeren', diff --git a/app/i18n/el/sub.php b/app/i18n/el/sub.php index 424fafc7b..aae9ae412 100644 --- a/app/i18n/el/sub.php +++ b/app/i18n/el/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath for:', // TODO ), 'rss' => 'RSS / Atom (default)', // TODO + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Clear cache', // TODO diff --git a/app/i18n/en-us/sub.php b/app/i18n/en-us/sub.php index a6b311084..92d75b81e 100644 --- a/app/i18n/en-us/sub.php +++ b/app/i18n/en-us/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath for:', // IGNORE ), 'rss' => 'RSS / Atom (default)', // IGNORE + 'xml_xpath' => 'XML + XPath', // IGNORE ), 'maintenance' => array( 'clear_cache' => 'Clear cache', // IGNORE diff --git a/app/i18n/en/sub.php b/app/i18n/en/sub.php index c7e100c25..04caaff05 100644 --- a/app/i18n/en/sub.php +++ b/app/i18n/en/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath for:', ), 'rss' => 'RSS / Atom (default)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Clear cache', diff --git a/app/i18n/es/sub.php b/app/i18n/es/sub.php index 52d681067..4fd2fa393 100644 --- a/app/i18n/es/sub.php +++ b/app/i18n/es/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath para:', ), 'rss' => 'RSS / Atom (por defecto)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Borrar caché', diff --git a/app/i18n/fr/sub.php b/app/i18n/fr/sub.php index f9df0dbcc..be6dc094d 100644 --- a/app/i18n/fr/sub.php +++ b/app/i18n/fr/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath pour :', ), 'rss' => 'RSS / Atom (par défaut)', + 'xml_xpath' => 'XML + XPath', // IGNORE ), 'maintenance' => array( 'clear_cache' => 'Vider le cache', diff --git a/app/i18n/he/sub.php b/app/i18n/he/sub.php index 25552ffa1..bae5f5177 100644 --- a/app/i18n/he/sub.php +++ b/app/i18n/he/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath for:', // TODO ), 'rss' => 'RSS / Atom (default)', // TODO + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Clear cache', // TODO diff --git a/app/i18n/id/sub.php b/app/i18n/id/sub.php index 7fdf5c024..3f9a4916a 100644 --- a/app/i18n/id/sub.php +++ b/app/i18n/id/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath for:', // TODO ), 'rss' => 'RSS / Atom (default)', // TODO + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Clear cache', // TODO diff --git a/app/i18n/it/sub.php b/app/i18n/it/sub.php index 8614caca7..7ab83cf07 100644 --- a/app/i18n/it/sub.php +++ b/app/i18n/it/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath per:', ), 'rss' => 'RSS / Atom (predefinito)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Svuota cache', diff --git a/app/i18n/ja/sub.php b/app/i18n/ja/sub.php index 80548c025..2425b21f3 100644 --- a/app/i18n/ja/sub.php +++ b/app/i18n/ja/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPathは:', ), 'rss' => 'RSS / Atom (標準)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'キャッシュのクリア', diff --git a/app/i18n/ko/sub.php b/app/i18n/ko/sub.php index e0ef5990b..f376247d5 100644 --- a/app/i18n/ko/sub.php +++ b/app/i18n/ko/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => '다음의 XPath:', ), 'rss' => 'RSS / Atom (기본값)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => '캐쉬 지우기', diff --git a/app/i18n/nl/sub.php b/app/i18n/nl/sub.php index 0fa767171..631da9477 100644 --- a/app/i18n/nl/sub.php +++ b/app/i18n/nl/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath voor:', ), 'rss' => 'RSS / Atom (standaard)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Cache leegmaken', diff --git a/app/i18n/oc/sub.php b/app/i18n/oc/sub.php index 92a73057c..008b4964d 100644 --- a/app/i18n/oc/sub.php +++ b/app/i18n/oc/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath per :', ), 'rss' => 'RSS / Atom (defaut)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Escafar lo cache', diff --git a/app/i18n/pl/sub.php b/app/i18n/pl/sub.php index b6121fcb7..565401982 100644 --- a/app/i18n/pl/sub.php +++ b/app/i18n/pl/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath dla:', ), 'rss' => 'RSS / Atom (domyślne)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Wyczyść pamięć podręczną', diff --git a/app/i18n/pt-br/sub.php b/app/i18n/pt-br/sub.php index c9755755e..4cdee8681 100644 --- a/app/i18n/pt-br/sub.php +++ b/app/i18n/pt-br/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath para:', ), 'rss' => 'RSS / Atom (padrão)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Limpar o cache', diff --git a/app/i18n/ru/sub.php b/app/i18n/ru/sub.php index 5704b53b1..d13c4c4f0 100644 --- a/app/i18n/ru/sub.php +++ b/app/i18n/ru/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath для:', ), 'rss' => 'RSS / Atom (по умолчанию)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Очистить кэш', diff --git a/app/i18n/sk/sub.php b/app/i18n/sk/sub.php index f583f6ca0..3c980d202 100644 --- a/app/i18n/sk/sub.php +++ b/app/i18n/sk/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath pre:', ), 'rss' => 'RSS / Atom (prednastavené)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Vymazať vyrovnáciu pamäť', diff --git a/app/i18n/tr/sub.php b/app/i18n/tr/sub.php index 056c059ac..3e03f667c 100644 --- a/app/i18n/tr/sub.php +++ b/app/i18n/tr/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath:', ), 'rss' => 'RSS / Atom (varsayılan)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Önbelleği temizle', diff --git a/app/i18n/zh-cn/sub.php b/app/i18n/zh-cn/sub.php index 2f9d17ace..5e6e570a9 100644 --- a/app/i18n/zh-cn/sub.php +++ b/app/i18n/zh-cn/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath 定位:', ), 'rss' => 'RSS / Atom (默认)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => '清理缓存', diff --git a/app/i18n/zh-tw/sub.php b/app/i18n/zh-tw/sub.php index dddcb2661..8a255645d 100644 --- a/app/i18n/zh-tw/sub.php +++ b/app/i18n/zh-tw/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath 定位:', ), 'rss' => 'RSS / Atom (默認)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => '清理暫存', diff --git a/app/views/helpers/export/opml.phtml b/app/views/helpers/export/opml.phtml index eb6f7523b..64c83c960 100644 --- a/app/views/helpers/export/opml.phtml +++ b/app/views/helpers/export/opml.phtml @@ -18,8 +18,15 @@ function feedsToOutlines($feeds, $excludeMutedFeeds = false): array { 'description' => htmlspecialchars_decode($feed->description(), ENT_QUOTES), ]; - if ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH) { - $outline['type'] = FreshRSS_Export_Service::TYPE_HTML_XPATH; + if ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH || $feed->kind() === FreshRSS_Feed::KIND_XML_XPATH) { + switch ($feed->kind()) { + case FreshRSS_Feed::KIND_HTML_XPATH: + $outline['type'] = FreshRSS_Export_Service::TYPE_HTML_XPATH; + break; + case FreshRSS_Feed::KIND_XML_XPATH: + $outline['type'] = FreshRSS_Export_Service::TYPE_XML_XPATH; + break; + } /** @var array */ $xPathSettings = $feed->attributes('xpath'); $outline['frss:xPathItem'] = $xPathSettings['item'] ?? null; diff --git a/app/views/helpers/feed/update.phtml b/app/views/helpers/feed/update.phtml index 5b958451d..0cd2ec0c3 100644 --- a/app/views/helpers/feed/update.phtml +++ b/app/views/helpers/feed/update.phtml @@ -391,8 +391,9 @@
diff --git a/app/views/subscription/add.phtml b/app/views/subscription/add.phtml index 7fa59e751..4e9da877f 100644 --- a/app/views/subscription/add.phtml +++ b/app/views/subscription/add.phtml @@ -70,6 +70,7 @@ diff --git a/docs/en/developers/OPML.md b/docs/en/developers/OPML.md index 2190a1de3..f65fd2faa 100644 --- a/docs/en/developers/OPML.md +++ b/docs/en/developers/OPML.md @@ -17,12 +17,14 @@ FreshRSS uses the XML namespace to export/import ext The list of the custom FreshRSS attributes can be seen in [the source code](https://github.com/FreshRSS/FreshRSS/blob/edge/app/views/helpers/export/opml.phtml), and here is an overview: -### HTML+XPath +### HTML+XPath or XML+XPath * ` ℹ️ [XPath 1.0](https://en.wikipedia.org/wiki/XPath) is a standard query language, which FreshRSS supports to enable [Web scraping](https://en.wikipedia.org/wiki/Web_scraping). +* ` $attributes */ function httpGet(string $url, string $cachePath, string $type = 'html', array $attributes = []): string { @@ -439,9 +443,15 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a $accept = '*/*;q=0.8'; switch ($type) { + case 'json': + $accept = 'application/json,application/javascript;q=0.9,text/javascript;q=0.8,*/*;q=0.7'; + break; case 'opml': $accept = 'text/x-opml,text/xml;q=0.9,application/xml;q=0.9,*/*;q=0.8'; break; + case 'xml': + $accept = 'application/xml,application/xhtml+xml,text/xml;q=0.9,*/*;q=0.8'; + break; case 'html': default: $accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'; diff --git a/p/scripts/feed.js b/p/scripts/feed.js index 1a6833db6..29af2a3ea 100644 --- a/p/scripts/feed.js +++ b/p/scripts/feed.js @@ -88,10 +88,17 @@ function init_disable_elements_on_update(parent) { function init_select_show(parent) { const listener = (select) => { const options = select.querySelectorAll('option[data-show]'); + const shows = {}; // To allow multiple options to show the same element for (const option of options) { - const elem = document.getElementById(option.dataset.show); + if (!shows[option.dataset.show]) { + shows[option.dataset.show] = option.selected; + } + } + + for (const show in shows) { + const elem = document.getElementById(show); if (elem) { - elem.style.display = option.selected ? 'block' : 'none'; + elem.style.display = shows[show] ? 'block' : 'none'; } } }; -- cgit v1.2.3