diff options
| author | 2022-02-28 20:22:43 +0100 | |
|---|---|---|
| committer | 2022-02-28 20:22:43 +0100 | |
| commit | 1fe66ad020ca8f0560bb9c6e311852ed77228f78 (patch) | |
| tree | df78da3f33a9f13a9d6ba3f2744c369bd6e313a6 /app/Controllers/feedController.php | |
| parent | fa23ae76ea46b329fb65329081df95e864b03b23 (diff) | |
Implement Web scraping "HTML + XPath" (#4220)
* More PHP type hints for Fever
Follow-up of https://github.com/FreshRSS/FreshRSS/pull/4201
Related to https://github.com/FreshRSS/FreshRSS/issues/4200
* Detail
* Draft
* Progress
* More draft
* Fix thumbnail PHP type hint
https://github.com/FreshRSS/FreshRSS/issues/4215
* More types
* A bit more
* Refactor FreshRSS_Entry::fromArray
* Progress
* Starts to work
* Categories
* Functional
* Layout update
* Fix relative URLs
* Cache system
* Forgotten files
* Remove a debug line
* Automatic form validation of XPath expressions
* data-leave-validation
* Fix reload action
* Simpler examples
* Fix column type for PostgreSQL
* Enforce HTTP encoding
* Readme
* Fix get full content
* target="_blank"
* gitignore
* htmlspecialchars_utf8
* Implement HTML <base>
And fix/revert `xml:base` support in SimplePie https://github.com/simplepie/simplepie/commit/e49c578817aa504d8d05cd7f33857aeda9d41908
* SimplePie upstream PR merged
https://github.com/simplepie/simplepie/pull/723
Diffstat (limited to 'app/Controllers/feedController.php')
| -rwxr-xr-x | app/Controllers/feedController.php | 48 |
1 files changed, 43 insertions, 5 deletions
diff --git a/app/Controllers/feedController.php b/app/Controllers/feedController.php index f18a67072..dabfb348f 100755 --- a/app/Controllers/feedController.php +++ b/app/Controllers/feedController.php @@ -38,7 +38,7 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { * @throws FreshRSS_Feed_Exception * @throws Minz_FileNotExistException */ - public static function addFeed($url, $title = '', $cat_id = 0, $new_cat_name = '', $http_auth = '', $attributes = array()) { + public static function addFeed($url, $title = '', $cat_id = 0, $new_cat_name = '', $http_auth = '', $attributes = array(), $kind = FreshRSS_Feed::KIND_RSS) { FreshRSS_UserDAO::touch(); @set_time_limit(300); @@ -67,10 +67,19 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { $cat_id = $cat == null ? FreshRSS_CategoryDAO::DEFAULTCATEGORYID : $cat->id(); $feed = new FreshRSS_Feed($url); //Throws FreshRSS_BadUrl_Exception + $feed->_kind($kind); $feed->_attributes('', $attributes); $feed->_httpAuth($http_auth); - $feed->load(true); //Throws FreshRSS_Feed_Exception, Minz_FileNotExistException $feed->_category($cat_id); + switch ($kind) { + case FreshRSS_Feed::KIND_RSS: + case FreshRSS_Feed::KIND_RSS_FORCED: + $feed->load(true); //Throws FreshRSS_Feed_Exception, Minz_FileNotExistException + break; + case FreshRSS_Feed::KIND_HTML_XPATH: + $feed->_website($url); + break; + } $feedDAO = FreshRSS_Factory::createFeedDao(); if ($feedDAO->searchByUrl($feed->url())) { @@ -85,8 +94,9 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { $values = array( 'url' => $feed->url(), + 'kind' => $feed->kind(), 'category' => $feed->category(), - 'name' => $title != '' ? $title : $feed->name(), + 'name' => $title != '' ? 
$title : $feed->name(true), 'website' => $feed->website(), 'description' => $feed->description(), 'lastUpdate' => 0, @@ -184,8 +194,25 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { $timeout = intval(Minz_Request::param('timeout', 0)); $attributes['timeout'] = $timeout > 0 ? $timeout : null; + $feed_kind = Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS); + if ($feed_kind == FreshRSS_Feed::KIND_HTML_XPATH) { + $xPathSettings = []; + if (Minz_Request::param('xPathFeedTitle', '') != '') $xPathSettings['feedTitle'] = Minz_Request::param('xPathFeedTitle', '', true); + if (Minz_Request::param('xPathItem', '') != '') $xPathSettings['item'] = Minz_Request::param('xPathItem', '', true); + if (Minz_Request::param('xPathItemTitle', '') != '') $xPathSettings['itemTitle'] = Minz_Request::param('xPathItemTitle', '', true); + if (Minz_Request::param('xPathItemContent', '') != '') $xPathSettings['itemContent'] = Minz_Request::param('xPathItemContent', '', true); + if (Minz_Request::param('xPathItemUri', '') != '') $xPathSettings['itemUri'] = Minz_Request::param('xPathItemUri', '', true); + if (Minz_Request::param('xPathItemAuthor', '') != '') $xPathSettings['itemAuthor'] = Minz_Request::param('xPathItemAuthor', '', true); + if (Minz_Request::param('xPathItemTimestamp', '') != '') $xPathSettings['itemTimestamp'] = Minz_Request::param('xPathItemTimestamp', '', true); + if (Minz_Request::param('xPathItemThumbnail', '') != '') $xPathSettings['itemThumbnail'] = Minz_Request::param('xPathItemThumbnail', '', true); + if (Minz_Request::param('xPathItemCategories', '') != '') $xPathSettings['itemCategories'] = Minz_Request::param('xPathItemCategories', '', true); + if (!empty($xPathSettings)) { + $attributes['xpath'] = $xPathSettings; + } + } + try { - $feed = self::addFeed($url, '', $cat, '', $http_auth, $attributes); + $feed = self::addFeed($url, '', $cat, '', $http_auth, $attributes, $feed_kind); } catch (FreshRSS_BadUrl_Exception $e) { // Given url was 
not a valid url! Minz_Log::warning($e->getMessage()); @@ -264,6 +291,14 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { } } + /** + * @param int $feed_id + * @param string $feed_url + * @param bool $force + * @param SimplePie|null $simplePiePush + * @param bool $noCommit + * @param int $maxFeeds + */ public static function actualizeFeed($feed_id, $feed_url, $force, $simplePiePush = null, $noCommit = false, $maxFeeds = 10) { @set_time_limit(300); @@ -338,6 +373,8 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { try { if ($simplePiePush) { $simplePie = $simplePiePush; //Used by WebSub + } elseif ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH) { + $simplePie = $feed->loadHtmlXpath(false, $isNewFeed); } else { $simplePie = $feed->load(false, $isNewFeed); } @@ -377,6 +414,7 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { $oldGuids = array(); // Add entries in database if possible. + /** @var FreshRSS_Entry $entry */ foreach ($entries as $entry) { if (isset($newGuids[$entry->guid()])) { continue; //Skip subsequent articles with same GUID @@ -765,7 +803,7 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController { //Re-fetch articles as if the feed was new. $feedDAO->updateFeed($feed->id(), [ 'lastUpdate' => 0 ]); - self::actualizeFeed($feed_id, null, false, null, true); + self::actualizeFeed($feed_id, '', false); //Extract all feed entries from database, load complete content and store them back in database. $entries = $entryDAO->listWhere('f', $feed_id, FreshRSS_Entry::STATE_ALL, 'DESC', 0); |
