diff options
| author | 2023-02-09 13:57:20 +0100 | |
|---|---|---|
| committer | 2023-02-09 13:57:20 +0100 | |
| commit | 05ae1b0d2684cea4eda664c5ea1a995cb9f0c4b9 (patch) | |
| tree | 7f8bac745b5431139bec5197afc288ee694d28f5 /app | |
| parent | b9a62a6aaacf2763c45f503ed5602ba43bedfce0 (diff) | |
XML+XPath (#5076)
* XML+XPath
#fix https://github.com/FreshRSS/FreshRSS/issues/5075
Implementation allowing to take an XML document as input using an XML parser (instead of an HTML parser for HTML+XPath)
* Remove noise from another PR
* Better MIME for XML
* And add glob *.xml for cache cleaning
* Minor syntax
* Add glob json for clean cache
Diffstat (limited to 'app')
29 files changed, 74 insertions, 15 deletions
diff --git a/app/Controllers/feedController.php b/app/Controllers/feedController.php index 2bef85f0e..84f38fe5e 100644 --- a/app/Controllers/feedController.php +++ b/app/Controllers/feedController.php @@ -81,6 +81,7 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { $feed->load(true); //Throws FreshRSS_Feed_Exception, Minz_FileNotExistException break; case FreshRSS_Feed::KIND_HTML_XPATH: + case FreshRSS_Feed::KIND_XML_XPATH: $feed->_website($url); break; } @@ -201,8 +202,8 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { $timeout = intval(Minz_Request::param('timeout', 0)); $attributes['timeout'] = $timeout > 0 ? $timeout : null; - $feed_kind = Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS); - if ($feed_kind == FreshRSS_Feed::KIND_HTML_XPATH) { + $feed_kind = (int)Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS); + if ($feed_kind === FreshRSS_Feed::KIND_HTML_XPATH || $feed_kind === FreshRSS_Feed::KIND_XML_XPATH) { $xPathSettings = []; if (Minz_Request::param('xPathFeedTitle', '') != '') $xPathSettings['feedTitle'] = Minz_Request::param('xPathFeedTitle', '', true); if (Minz_Request::param('xPathItem', '') != '') $xPathSettings['item'] = Minz_Request::param('xPathItem', '', true); @@ -385,10 +386,15 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { if ($simplePiePush) { $simplePie = $simplePiePush; //Used by WebSub } elseif ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH) { - $simplePie = $feed->loadHtmlXpath(false, $isNewFeed); - if ($simplePie == null) { + $simplePie = $feed->loadHtmlXpath(); + if ($simplePie === null) { throw new FreshRSS_Feed_Exception('HTML+XPath Web scraping failed for [' . $feed->url(false) . ']'); } + } elseif ($feed->kind() === FreshRSS_Feed::KIND_XML_XPATH) { + $simplePie = $feed->loadHtmlXpath(); + if ($simplePie === null) { + throw new FreshRSS_Feed_Exception('XML+XPath parsing failed for [' . $feed->url(false) . ']'); + } } else { $simplePie = $feed->load(false, $isNewFeed); } diff --git a/app/Controllers/subscriptionController.php b/app/Controllers/subscriptionController.php index b2ee046d9..f0355a82a 100644 --- a/app/Controllers/subscriptionController.php +++ b/app/Controllers/subscriptionController.php @@ -203,7 +203,7 @@ class FreshRSS_subscription_Controller extends FreshRSS_ActionController { $feed->_filtersAction('read', preg_split('/[\n\r]+/', Minz_Request::param('filteractions_read', ''))); $feed->_kind(intval(Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS))); - if ($feed->kind() == FreshRSS_Feed::KIND_HTML_XPATH) { + if ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH || $feed->kind() === FreshRSS_Feed::KIND_XML_XPATH) { $xPathSettings = []; if (Minz_Request::param('xPathItem', '') != '') $xPathSettings['item'] = Minz_Request::param('xPathItem', '', true); if (Minz_Request::param('xPathItemTitle', '') != '') $xPathSettings['itemTitle'] = Minz_Request::param('xPathItemTitle', '', true); diff --git a/app/Models/Feed.php b/app/Models/Feed.php index f7ff76768..7c46199a5 100644 --- a/app/Models/Feed.php +++ b/app/Models/Feed.php @@ -18,6 +18,11 @@ class FreshRSS_Feed extends Minz_Model { */ const KIND_HTML_XPATH = 10; /** + * Normal XML with XPath scraping + * @var int + */ + const KIND_XML_XPATH = 15; + /** * Normal JSON with XPath scraping * @var int */ @@ -586,7 +591,7 @@ class FreshRSS_Feed extends Minz_Model { /** * @return SimplePie|null */ - public function loadHtmlXpath(bool $loadDetails = false, bool $noCache = false) { + public function loadHtmlXpath() { if ($this->url == '') { return null; } @@ -614,8 +619,9 @@ class FreshRSS_Feed extends Minz_Model { return null; } - $cachePath = FreshRSS_Feed::cacheFilename($feedSourceUrl, $this->attributes(), FreshRSS_Feed::KIND_HTML_XPATH); - $html = httpGet($feedSourceUrl, $cachePath, 'html', $this->attributes()); + $cachePath = FreshRSS_Feed::cacheFilename($feedSourceUrl, $this->attributes(), $this->kind()); + $html = httpGet($feedSourceUrl, $cachePath, + $this->kind() === FreshRSS_Feed::KIND_XML_XPATH ? 'xml' : 'html', $this->attributes()); if (strlen($html) <= 0) { return null; } @@ -630,7 +636,18 @@ class FreshRSS_Feed extends Minz_Model { $doc = new DOMDocument(); $doc->recover = true; $doc->strictErrorChecking = false; - $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); + + switch ($this->kind()) { + case FreshRSS_Feed::KIND_HTML_XPATH: + $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); + break; + case FreshRSS_Feed::KIND_XML_XPATH: + $doc->loadXML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING); + break; + default: + return null; + } + $xpath = new DOMXPath($doc); $view->rss_title = $xPathFeedTitle == '' ? $this->name() : htmlspecialchars(@$xpath->evaluate('normalize-space(' . $xPathFeedTitle . ')'), ENT_COMPAT, 'UTF-8'); @@ -776,8 +793,10 @@ class FreshRSS_Feed extends Minz_Model { public static function cacheFilename(string $url, array $attributes, int $kind = FreshRSS_Feed::KIND_RSS): string { $simplePie = customSimplePie($attributes); $filename = $simplePie->get_cache_filename($url); - if ($kind == FreshRSS_Feed::KIND_HTML_XPATH) { + if ($kind === FreshRSS_Feed::KIND_HTML_XPATH) { return CACHE_PATH . '/' . $filename . '.html'; + } elseif ($kind === FreshRSS_Feed::KIND_XML_XPATH) { + return CACHE_PATH . '/' . $filename . '.xml'; } else { return CACHE_PATH . '/' . $filename . '.spc'; } diff --git a/app/Services/ExportService.php b/app/Services/ExportService.php index 2f35666a8..6b0a3f178 100644 --- a/app/Services/ExportService.php +++ b/app/Services/ExportService.php @@ -21,6 +21,7 @@ class FreshRSS_Export_Service { const FRSS_NAMESPACE = 'https://freshrss.org/opml'; const TYPE_HTML_XPATH = 'HTML+XPath'; + const TYPE_XML_XPATH = 'XML+XPath'; const TYPE_RSS_ATOM = 'rss'; /** diff --git a/app/Services/ImportService.php b/app/Services/ImportService.php index 68aa6f741..55aa28679 100644 --- a/app/Services/ImportService.php +++ b/app/Services/ImportService.php @@ -160,10 +160,13 @@ class FreshRSS_Import_Service { $feed->_website($website); $feed->_description($description); - switch ($feed_elt['type'] ?? '') { + switch (strtolower($feed_elt['type'] ?? '')) { case strtolower(FreshRSS_Export_Service::TYPE_HTML_XPATH): $feed->_kind(FreshRSS_Feed::KIND_HTML_XPATH); break; + case strtolower(FreshRSS_Export_Service::TYPE_XML_XPATH): + $feed->_kind(FreshRSS_Feed::KIND_XML_XPATH); + break; case strtolower(FreshRSS_Export_Service::TYPE_RSS_ATOM): default: $feed->_kind(FreshRSS_Feed::KIND_RSS); diff --git a/app/i18n/cz/sub.php b/app/i18n/cz/sub.php index a11a9359d..3d08c315b 100644 --- a/app/i18n/cz/sub.php +++ b/app/i18n/cz/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath pro:', ), 'rss' => 'RSS / Atom (výchozí)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Vymazat mezipaměť', diff --git a/app/i18n/de/sub.php b/app/i18n/de/sub.php index 580f7d348..b265c1b98 100644 --- a/app/i18n/de/sub.php +++ b/app/i18n/de/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath für:', ), 'rss' => 'RSS / Atom (Standard)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Zwischenspeicher leeren', diff --git a/app/i18n/el/sub.php b/app/i18n/el/sub.php index 424fafc7b..aae9ae412 100644 --- a/app/i18n/el/sub.php +++ b/app/i18n/el/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath for:', // TODO ), 'rss' => 'RSS / Atom (default)', // TODO + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Clear cache', // TODO diff --git a/app/i18n/en-us/sub.php b/app/i18n/en-us/sub.php index a6b311084..92d75b81e 100644 --- a/app/i18n/en-us/sub.php +++ b/app/i18n/en-us/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath for:', // IGNORE ), 'rss' => 'RSS / Atom (default)', // IGNORE + 'xml_xpath' => 'XML + XPath', // IGNORE ), 'maintenance' => array( 'clear_cache' => 'Clear cache', // IGNORE diff --git a/app/i18n/en/sub.php b/app/i18n/en/sub.php index c7e100c25..04caaff05 100644 --- a/app/i18n/en/sub.php +++ b/app/i18n/en/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath for:', ), 'rss' => 'RSS / Atom (default)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Clear cache', diff --git a/app/i18n/es/sub.php b/app/i18n/es/sub.php index 52d681067..4fd2fa393 100644 --- a/app/i18n/es/sub.php +++ b/app/i18n/es/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath para:', ), 'rss' => 'RSS / Atom (por defecto)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Borrar caché', diff --git a/app/i18n/fr/sub.php b/app/i18n/fr/sub.php index f9df0dbcc..be6dc094d 100644 --- a/app/i18n/fr/sub.php +++ b/app/i18n/fr/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath pour :', ), 'rss' => 'RSS / Atom (par défaut)', + 'xml_xpath' => 'XML + XPath', // IGNORE ), 'maintenance' => array( 'clear_cache' => 'Vider le cache', diff --git a/app/i18n/he/sub.php b/app/i18n/he/sub.php index 25552ffa1..bae5f5177 100644 --- a/app/i18n/he/sub.php +++ b/app/i18n/he/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath for:', // TODO ), 'rss' => 'RSS / Atom (default)', // TODO + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Clear cache', // TODO diff --git a/app/i18n/id/sub.php b/app/i18n/id/sub.php index 7fdf5c024..3f9a4916a 100644 --- a/app/i18n/id/sub.php +++ b/app/i18n/id/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath for:', // TODO ), 'rss' => 'RSS / Atom (default)', // TODO + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Clear cache', // TODO diff --git a/app/i18n/it/sub.php b/app/i18n/it/sub.php index 8614caca7..7ab83cf07 100644 --- a/app/i18n/it/sub.php +++ b/app/i18n/it/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath per:', ), 'rss' => 'RSS / Atom (predefinito)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Svuota cache', diff --git a/app/i18n/ja/sub.php b/app/i18n/ja/sub.php index 80548c025..2425b21f3 100644 --- a/app/i18n/ja/sub.php +++ b/app/i18n/ja/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPathは:', ), 'rss' => 'RSS / Atom (標準)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'キャッシュのクリア', diff --git a/app/i18n/ko/sub.php b/app/i18n/ko/sub.php index e0ef5990b..f376247d5 100644 --- a/app/i18n/ko/sub.php +++ b/app/i18n/ko/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => '다음의 XPath:', ), 'rss' => 'RSS / Atom (기본값)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => '캐쉬 지우기', diff --git a/app/i18n/nl/sub.php b/app/i18n/nl/sub.php index 0fa767171..631da9477 100644 --- a/app/i18n/nl/sub.php +++ b/app/i18n/nl/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath voor:', ), 'rss' => 'RSS / Atom (standaard)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Cache leegmaken', diff --git a/app/i18n/oc/sub.php b/app/i18n/oc/sub.php index 92a73057c..008b4964d 100644 --- a/app/i18n/oc/sub.php +++ b/app/i18n/oc/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath per :', ), 'rss' => 'RSS / Atom (defaut)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Escafar lo cache', diff --git a/app/i18n/pl/sub.php b/app/i18n/pl/sub.php index b6121fcb7..565401982 100644 --- a/app/i18n/pl/sub.php +++ b/app/i18n/pl/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath dla:', ), 'rss' => 'RSS / Atom (domyślne)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Wyczyść pamięć podręczną', diff --git a/app/i18n/pt-br/sub.php b/app/i18n/pt-br/sub.php index c9755755e..4cdee8681 100644 --- a/app/i18n/pt-br/sub.php +++ b/app/i18n/pt-br/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath para:', ), 'rss' => 'RSS / Atom (padrão)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Limpar o cache', diff --git a/app/i18n/ru/sub.php b/app/i18n/ru/sub.php index 5704b53b1..d13c4c4f0 100644 --- a/app/i18n/ru/sub.php +++ b/app/i18n/ru/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath для:', ), 'rss' => 'RSS / Atom (по умолчанию)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Очистить кэш', diff --git a/app/i18n/sk/sub.php b/app/i18n/sk/sub.php index f583f6ca0..3c980d202 100644 --- a/app/i18n/sk/sub.php +++ b/app/i18n/sk/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath pre:', ), 'rss' => 'RSS / Atom (prednastavené)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Vymazať vyrovnáciu pamäť', diff --git a/app/i18n/tr/sub.php b/app/i18n/tr/sub.php index 056c059ac..3e03f667c 100644 --- a/app/i18n/tr/sub.php +++ b/app/i18n/tr/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath:', ), 'rss' => 'RSS / Atom (varsayılan)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => 'Önbelleği temizle', diff --git a/app/i18n/zh-cn/sub.php b/app/i18n/zh-cn/sub.php index 2f9d17ace..5e6e570a9 100644 --- a/app/i18n/zh-cn/sub.php +++ b/app/i18n/zh-cn/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath 定位:', ), 'rss' => 'RSS / Atom (默认)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => '清理缓存', diff --git a/app/i18n/zh-tw/sub.php b/app/i18n/zh-tw/sub.php index dddcb2661..8a255645d 100644 --- a/app/i18n/zh-tw/sub.php +++ b/app/i18n/zh-tw/sub.php @@ -122,6 +122,7 @@ return array( 'xpath' => 'XPath 定位:', ), 'rss' => 'RSS / Atom (默認)', + 'xml_xpath' => 'XML + XPath', // TODO ), 'maintenance' => array( 'clear_cache' => '清理暫存', diff --git a/app/views/helpers/export/opml.phtml b/app/views/helpers/export/opml.phtml index eb6f7523b..64c83c960 100644 --- a/app/views/helpers/export/opml.phtml +++ b/app/views/helpers/export/opml.phtml @@ -18,8 +18,15 @@ function feedsToOutlines($feeds, $excludeMutedFeeds = false): array { 'description' => htmlspecialchars_decode($feed->description(), ENT_QUOTES), ]; - if ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH) { - $outline['type'] = FreshRSS_Export_Service::TYPE_HTML_XPATH; + if ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH || $feed->kind() === FreshRSS_Feed::KIND_XML_XPATH) { + switch ($feed->kind()) { + case FreshRSS_Feed::KIND_HTML_XPATH: + $outline['type'] = FreshRSS_Export_Service::TYPE_HTML_XPATH; + break; + case FreshRSS_Feed::KIND_XML_XPATH: + $outline['type'] = FreshRSS_Export_Service::TYPE_XML_XPATH; + break; + } /** @var array<string,string> */ $xPathSettings = $feed->attributes('xpath'); $outline['frss:xPathItem'] = $xPathSettings['item'] ?? null; diff --git a/app/views/helpers/feed/update.phtml b/app/views/helpers/feed/update.phtml index 5b958451d..0cd2ec0c3 100644 --- a/app/views/helpers/feed/update.phtml +++ b/app/views/helpers/feed/update.phtml @@ -391,8 +391,9 @@ <label class="group-name" for="feed_kind"><?= _t('sub.feed.kind') ?></label> <div class="group-controls"> <select name="feed_kind" id="feed_kind" class="select-show w100"> - <option value="<?= FreshRSS_Feed::KIND_RSS ?>" <?= $this->feed->kind() == FreshRSS_Feed::KIND_RSS ? 'selected="selected"' : '' ?>><?= _t('sub.feed.kind.rss') ?></option> - <option value="<?= FreshRSS_Feed::KIND_HTML_XPATH ?>" <?= $this->feed->kind() == FreshRSS_Feed::KIND_HTML_XPATH ? 'selected="selected"' : '' ?> data-show="html_xpath"><?= _t('sub.feed.kind.html_xpath') ?></option> + <option value="<?= FreshRSS_Feed::KIND_RSS ?>" <?= $this->feed->kind() === FreshRSS_Feed::KIND_RSS ? 'selected="selected"' : '' ?>><?= _t('sub.feed.kind.rss') ?></option> + <option value="<?= FreshRSS_Feed::KIND_HTML_XPATH ?>" <?= $this->feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH ? 'selected="selected"' : '' ?> data-show="html_xpath"><?= _t('sub.feed.kind.html_xpath') ?></option> + <option value="<?= FreshRSS_Feed::KIND_XML_XPATH ?>" <?= $this->feed->kind() === FreshRSS_Feed::KIND_XML_XPATH ? 'selected="selected"' : '' ?> data-show="html_xpath"><?= _t('sub.feed.kind.xml_xpath') ?></option> </select> </div> </div> diff --git a/app/views/subscription/add.phtml b/app/views/subscription/add.phtml index 7fa59e751..4e9da877f 100644 --- a/app/views/subscription/add.phtml +++ b/app/views/subscription/add.phtml @@ -70,6 +70,7 @@ <select name="feed_kind" id="feed_kind" class="select-show"> <option value="<?= FreshRSS_Feed::KIND_RSS ?>" selected="selected"><?= _t('sub.feed.kind.rss') ?></option> <option value="<?= FreshRSS_Feed::KIND_HTML_XPATH ?>" data-show="html_xpath"><?= _t('sub.feed.kind.html_xpath') ?></option> + <option value="<?= FreshRSS_Feed::KIND_XML_XPATH ?>" data-show="html_xpath"><?= _t('sub.feed.kind.xml_xpath') ?></option> </select> </div> </div> |
