diff options
| author | 2024-10-14 09:34:16 +0200 | |
|---|---|---|
| committer | 2024-10-14 09:34:16 +0200 | |
| commit | 256dcc21bb222184d5e917ea57cec334b74b96f4 (patch) | |
| tree | 89455b225b6fb2f00b98cead6d5450282c4cc6ce /app/Models/Feed.php | |
| parent | 40c4d798f0decf2d49005277c6d3a3d0e39bedfd (diff) | |
New unicity policies for feeds with bad GUIDs (#4487)
New set of unicity criteria options.
New tolerance heuristic:
> `$invalidGuidsTolerance` (default 0.05) The maximum ratio of invalid GUIDs to tolerate before degrading the unicity criteria; the tolerated count (ratio × number of articles) is rounded to the nearest integer.
> Example for 0.05 (5% rounded): tolerate 0 invalid GUIDs for up to 9 articles, 1 from 10 articles, 2 from 30, 3 from 50, 4 from 70, 5 from 90, 6 from 110, etc.
> The default value of 5% rounded was chosen to allow 1 invalid GUID for feeds of 10 articles, which is a frequently observed number of articles.
Diffstat (limited to 'app/Models/Feed.php')
| -rw-r--r-- | app/Models/Feed.php | 106 |
1 files changed, 83 insertions, 23 deletions
diff --git a/app/Models/Feed.php b/app/Models/Feed.php index ad84c35a1..b5b599d5f 100644 --- a/app/Models/Feed.php +++ b/app/Models/Feed.php @@ -429,14 +429,61 @@ class FreshRSS_Feed extends Minz_Model { } /** + * Decide the GUID of an entry based on the feed’s policy. + * @param \SimplePie\Item $item The item to decide the GUID for. + * @param bool $fallback Whether to automatically switch to the next policy in case of blank GUID. + * @return string The decided GUID for the entry. + */ + protected function decideEntryGuid(\SimplePie\Item $item, bool $fallback = false): string { + $unicityCriteria = $this->attributeString('unicityCriteria'); + if ($this->attributeBoolean('hasBadGuids')) { // Legacy + $unicityCriteria = 'link'; + } + + $entryId = safe_ascii($item->get_id(false, false)); + + $guid = match ($unicityCriteria) { + null => $entryId, + 'link' => $item->get_permalink() ?? '', + 'sha1:link_published' => sha1($item->get_permalink() . $item->get_date('U')), + 'sha1:link_published_title' => sha1($item->get_permalink() . $item->get_date('U') . $item->get_title()), + 'sha1:link_published_title_content' => sha1($item->get_permalink() . $item->get_date('U') . $item->get_title() . $item->get_content()), + default => $entryId, + }; + + $blankHash = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'; // sha1('') + if ($guid === $blankHash) { + $guid = ''; + } + + if ($fallback && $guid === '') { + if ($entryId !== '') { + $guid = $entryId; + } elseif (($item->get_permalink() ?? '') !== '') { + $guid = sha1($item->get_permalink() . $item->get_date('U')); + } elseif (($item->get_title() ?? '') !== '') { + $guid = sha1($item->get_permalink() . $item->get_date('U') . $item->get_title()); + } else { + $guid = sha1($item->get_permalink() . $item->get_date('U') . $item->get_title() . 
$item->get_content()); + } + if ($guid === $blankHash) { + $guid = ''; + } + } + + return $guid; + } + + /** + * @param float $invalidGuidsTolerance (default 0.05) The maximum ratio (rounded) of invalid GUIDs to tolerate before degrading the unicity criteria. + * Example for 0.05 (5% rounded): tolerate 0 invalid GUIDs for up to 9 articles, 1 for 10, 2 for 30, 3 for 50, 4 for 70, 5 for 90, 6 for 110, etc. + * The default value of 5% rounded was chosen to allow 1 invalid GUID for feeds of 10 articles, which is a frequently observed amount of articles. * @return array<string> */ - public function loadGuids(\SimplePie\SimplePie $simplePie): array { - $hasUniqueGuids = true; + public function loadGuids(\SimplePie\SimplePie $simplePie, float $invalidGuidsTolerance = 0.05): array { + $invalidGuids = 0; $testGuids = []; $guids = []; - $links = []; - $hadBadGuids = $this->attributeBoolean('hasBadGuids'); $items = $simplePie->get_items(); if (empty($items)) { @@ -447,33 +494,46 @@ class FreshRSS_Feed extends Minz_Model { if ($item == null) { continue; } - $guid = safe_ascii($item->get_id(false, false)); - $hasUniqueGuids &= empty($testGuids['_' . $guid]); + $guid = $this->decideEntryGuid($item, fallback: true); + if ($guid === '' || !empty($testGuids['_' . $guid])) { + $invalidGuids++; + Minz_Log::debug('Invalid GUID [' . $guid . '] for feed ' . $this->url); + } $testGuids['_' . $guid] = true; $guids[] = $guid; - $permalink = $item->get_permalink(); - if ($permalink != null) { - $links[] = $permalink; - } } - if ($hadBadGuids != !$hasUniqueGuids) { - if ($hadBadGuids) { - Minz_Log::warning('Feed has invalid GUIDs: ' . $this->url); - } else { - Minz_Log::warning('Feed has valid GUIDs again: ' . $this->url); + if ($invalidGuids > 0) { + Minz_Log::warning("Feed has {$invalidGuids} invalid GUIDs: " . 
$this->url); + if (!$this->attributeBoolean('unicityCriteriaForced') && $invalidGuids > round($invalidGuidsTolerance * count($items))) { + $unicityCriteria = $this->attributeString('unicityCriteria'); + if ($this->attributeBoolean('hasBadGuids')) { // Legacy + $unicityCriteria = 'link'; + } + + // Automatic fallback to next (degraded) unicity criteria + $newUnicityCriteria = match ($unicityCriteria) { + null => 'sha1:link_published', + 'link' => 'sha1:link_published', + 'sha1:link_published' => 'sha1:link_published_title', + default => $unicityCriteria, + }; + + if ($newUnicityCriteria !== $unicityCriteria) { + $this->_attribute('hasBadGuids', null); // Remove legacy + $this->_attribute('unicityCriteria', $newUnicityCriteria); + Minz_Log::warning('Feed unicity policy degraded (' . ($unicityCriteria ?: 'id') . ' → ' . $newUnicityCriteria . '): ' . $this->url); + return $this->loadGuids($simplePie, $invalidGuidsTolerance); + } } - $feedDAO = FreshRSS_Factory::createFeedDao(); - $feedDAO->updateFeedAttribute($this, 'hasBadGuids', !$hasUniqueGuids); + $this->_error(true); } - return $hasUniqueGuids ? $guids : $links; + return $guids; } /** @return Traversable<FreshRSS_Entry> */ public function loadEntries(\SimplePie\SimplePie $simplePie): Traversable { - $hasBadGuids = $this->attributeBoolean('hasBadGuids'); - $items = $simplePie->get_items(); if (empty($items)) { return; @@ -487,7 +547,7 @@ class FreshRSS_Feed extends Minz_Model { $title = html_only_entity_decode(strip_tags($item->get_title() ?? '')); $authors = $item->get_authors(); $link = $item->get_permalink(); - $date = @strtotime((string)($item->get_date() ?? 
'')) ?: 0; + $date = $item->get_date('U'); //Tag processing (tag == category) $categories = $item->get_categories(); @@ -571,7 +631,7 @@ class FreshRSS_Feed extends Minz_Model { } } - $guid = safe_ascii($item->get_id(false, false)); + $guid = $this->decideEntryGuid($item, fallback: true); unset($item); $authorNames = ''; @@ -587,7 +647,7 @@ class FreshRSS_Feed extends Minz_Model { $entry = new FreshRSS_Entry( $this->id(), - $hasBadGuids ? '' : $guid, + $guid, $title == '' ? '' : $title, $authorNames, $content == '' ? '' : $content, |
