diff options
| author | 2024-10-13 15:28:45 +0200 | |
|---|---|---|
| committer | 2024-10-13 15:28:45 +0200 | |
| commit | ccb132523a4ee740d5b576574e9f44668021fbe6 (patch) | |
| tree | 0b6977a345c56eff277abb0bc9199b0010f003a8 /app/Models/Feed.php | |
| parent | 91624037c7d73eb545478aab2f8abc55fc224453 (diff) | |
New feed mode: HTML + XPath + JSON dot notation (JSON in HTML) (#6888)
* New feed mode: HTML + XPath + JSON dot notation (JSON in HTML)
Same as `JSON+DotNotation` but first extracting the JSON string from an HTML document thanks to an XPath expression.
Example: `//script[@type='application/json']`
fix https://github.com/FreshRSS/FreshRSS/discussions/6876
* JavaScript UI to show/hide new field
* Casing xPathToJson
* Slight renaming
Diffstat (limited to 'app/Models/Feed.php')
| -rw-r--r-- | app/Models/Feed.php | 35 |
1 files changed, 31 insertions, 4 deletions
```diff
diff --git a/app/Models/Feed.php b/app/Models/Feed.php
index b9afa9e83..ad84c35a1 100644
--- a/app/Models/Feed.php
+++ b/app/Models/Feed.php
@@ -32,6 +32,8 @@ class FreshRSS_Feed extends Minz_Model {
 
 	public const KIND_JSONFEED = 25;
 	public const KIND_JSON_DOTNOTATION = 30;
+	/** JSON embedded in HTML */
+	public const KIND_HTML_XPATH_JSON_DOTNOTATION = 35;
 
 	public const PRIORITY_IMPORTANT = 20;
 	public const PRIORITY_MAIN_STREAM = 10;
@@ -639,6 +641,24 @@ class FreshRSS_Feed extends Minz_Model {
 		];
 	}
 
+	private function extractJsonFromHtml(string $html): ?string {
+		$xPathToJson = $this->attributeString('xPathToJson') ?? '';
+		if ($xPathToJson === '') {
+			return null;
+		}
+
+		$doc = new DOMDocument();
+		$doc->recover = true;
+		$doc->strictErrorChecking = false;
+		if (!$doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
+			return null;
+		}
+
+		$xpath = new DOMXPath($doc);
+		$json = @$xpath->evaluate('normalize-space(' . $xPathToJson . ')');
+		return is_string($json) ? $json : null;
+	}
+
 	public function loadJson(): ?\SimplePie\SimplePie {
 		if ($this->url == '') {
 			return null;
@@ -648,14 +668,21 @@ class FreshRSS_Feed extends Minz_Model {
 			return null;
 		}
 
-		$httpAccept = 'json';
-		$json = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions());
-		if (strlen($json) <= 0) {
+		$httpAccept = $this->kind() === FreshRSS_Feed::KIND_HTML_XPATH_JSON_DOTNOTATION ? 'html' : 'json';
+		$content = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions());
+		if (strlen($content) <= 0) {
 			return null;
 		}
 
+		if ($this->kind() === FreshRSS_Feed::KIND_HTML_XPATH_JSON_DOTNOTATION) {
+			$content = $this->extractJsonFromHtml($content);
+			if ($content == null) {
+				return null;
+			}
+		}
+
 		//check if the content is actual JSON
-		$jf = json_decode($json, true);
+		$jf = json_decode($content, true);
 		if (json_last_error() !== JSON_ERROR_NONE || !is_array($jf)) {
 			return null;
 		}
```
