aboutsummaryrefslogtreecommitdiff
path: root/app/Models
diff options
context:
space:
mode:
authorGravatar Alexandre Alapetite <alexandre@alapetite.fr> 2024-10-14 09:34:16 +0200
committerGravatar GitHub <noreply@github.com> 2024-10-14 09:34:16 +0200
commit 256dcc21bb222184d5e917ea57cec334b74b96f4 (patch)
tree 89455b225b6fb2f00b98cead6d5450282c4cc6ce /app/Models
parent 40c4d798f0decf2d49005277c6d3a3d0e39bedfd (diff)
New unicity policies for feeds with bad GUIDs (#4487)
New set of unicity criteria options.
New tolerance heuristic:
> `$invalidGuidsTolerance` (default 0.05) The maximum ratio (rounded) of invalid GUIDs to tolerate before degrading the unicity criteria.
> Example for 0.05 (5% rounded): tolerate 0 invalid GUIDs for up to 9 articles, 1 for 10, 2 for 30, 3 for 50, 4 for 70, 5 for 90, 6 for 110, etc.
> The default value of 5% rounded was chosen to allow 1 invalid GUID for feeds of 10 articles, which is a frequently observed amount of articles.
Diffstat (limited to 'app/Models')
-rw-r--r-- app/Models/Entry.php 13
-rw-r--r-- app/Models/Feed.php 106
2 files changed, 87 insertions, 32 deletions
diff --git a/app/Models/Entry.php b/app/Models/Entry.php
index fe6702bcd..6e87fe5cd 100644
--- a/app/Models/Entry.php
+++ b/app/Models/Entry.php
@@ -456,7 +456,7 @@ HTML;
}
public function hash(): string {
- if ($this->hash == '') {
+ if ($this->hash === '') {
//Do not include $this->date because it may be automatically generated when lacking
$this->hash = md5($this->link . $this->title . $this->authors(true) . $this->originalContent() . $this->tags(true));
}
@@ -481,16 +481,11 @@ HTML;
$this->date_added = $value;
}
}
+
public function _guid(string $value): void {
- $value = trim($value);
- if (empty($value)) { // removed behaviour: a blank GUID was replaced by the entry link, then by hash()
- $value = $this->link;
- if (empty($value)) {
- $value = $this->hash();
- }
- }
- $this->guid = $value;
+ $this->guid = trim($value); // new behaviour: store the trimmed value as given; a blank GUID stays blank in this setter
}
+
public function _title(string $value): void {
$this->hash = '';
$this->title = trim($value);
diff --git a/app/Models/Feed.php b/app/Models/Feed.php
index ad84c35a1..b5b599d5f 100644
--- a/app/Models/Feed.php
+++ b/app/Models/Feed.php
@@ -429,14 +429,61 @@ class FreshRSS_Feed extends Minz_Model {
}
/**
+ * Decide the GUID of an entry based on the feed’s policy (the `unicityCriteria` attribute; a truthy legacy `hasBadGuids` attribute forces the `link` policy).
+ * @param \SimplePie\Item $item The item to decide the GUID for.
+ * @param bool $fallback Whether to automatically switch to the next policy in case of blank GUID: the item ID first, then a SHA-1 of link+date, link+date+title, or link+date+title+content.
+ * @return string The decided GUID for the entry; an empty string when no usable value was found (a SHA-1 of the empty string is normalised to an empty string).
+ */
+ protected function decideEntryGuid(\SimplePie\Item $item, bool $fallback = false): string {
+ $unicityCriteria = $this->attributeString('unicityCriteria');
+ if ($this->attributeBoolean('hasBadGuids')) { // Legacy
+ $unicityCriteria = 'link';
+ }
+
+ $entryId = safe_ascii($item->get_id(false, false)); // NOTE(review): the two false arguments presumably disable hashing inside SimplePie; confirm against SimplePie\Item::get_id()
+
+ $guid = match ($unicityCriteria) {
+ null => $entryId, // no policy configured: use the item ID
+ 'link' => $item->get_permalink() ?? '',
+ 'sha1:link_published' => sha1($item->get_permalink() . $item->get_date('U')),
+ 'sha1:link_published_title' => sha1($item->get_permalink() . $item->get_date('U') . $item->get_title()),
+ 'sha1:link_published_title_content' => sha1($item->get_permalink() . $item->get_date('U') . $item->get_title() . $item->get_content()),
+ default => $entryId, // unrecognised policy value: same result as no policy
+ };
+
+ $blankHash = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'; // sha1(''): what the sha1 policies produce when every concatenated part is empty
+ if ($guid === $blankHash) {
+ $guid = '';
+ }
+
+ if ($fallback && $guid === '') { // the selected policy yielded nothing: try the alternatives in a fixed order
+ if ($entryId !== '') {
+ $guid = $entryId;
+ } elseif (($item->get_permalink() ?? '') !== '') {
+ $guid = sha1($item->get_permalink() . $item->get_date('U'));
+ } elseif (($item->get_title() ?? '') !== '') { // reached only when the permalink check above failed
+ $guid = sha1($item->get_permalink() . $item->get_date('U') . $item->get_title());
+ } else {
+ $guid = sha1($item->get_permalink() . $item->get_date('U') . $item->get_title() . $item->get_content());
+ }
+ if ($guid === $blankHash) { // every part was empty: report a blank GUID instead of the hash of nothing
+ $guid = '';
+ }
+ }
+
+ return $guid;
+ }
+
+ /**
+ * @param float $invalidGuidsTolerance (default 0.05) The maximum ratio (rounded) of invalid GUIDs to tolerate before degrading the unicity criteria.
+ * Example for 0.05 (5% rounded): tolerate 0 invalid GUIDs for up to 9 articles, 1 for 10, 2 for 30, 3 for 50, 4 for 70, 5 for 90, 6 for 110, etc.
+ * The default value of 5% rounded was chosen to allow 1 invalid GUID for feeds of 10 articles, which is a frequently observed amount of articles.
* @return array<string>
*/
- public function loadGuids(\SimplePie\SimplePie $simplePie): array {
- $hasUniqueGuids = true;
+ public function loadGuids(\SimplePie\SimplePie $simplePie, float $invalidGuidsTolerance = 0.05): array {
+ $invalidGuids = 0;
$testGuids = [];
$guids = [];
- $links = [];
- $hadBadGuids = $this->attributeBoolean('hasBadGuids');
$items = $simplePie->get_items();
if (empty($items)) {
@@ -447,33 +494,46 @@ class FreshRSS_Feed extends Minz_Model {
if ($item == null) {
continue;
}
- $guid = safe_ascii($item->get_id(false, false));
- $hasUniqueGuids &= empty($testGuids['_' . $guid]);
+ $guid = $this->decideEntryGuid($item, fallback: true);
+ if ($guid === '' || !empty($testGuids['_' . $guid])) {
+ $invalidGuids++;
+ Minz_Log::debug('Invalid GUID [' . $guid . '] for feed ' . $this->url);
+ }
$testGuids['_' . $guid] = true;
$guids[] = $guid;
- $permalink = $item->get_permalink();
- if ($permalink != null) {
- $links[] = $permalink;
- }
}
- if ($hadBadGuids != !$hasUniqueGuids) {
- if ($hadBadGuids) {
- Minz_Log::warning('Feed has invalid GUIDs: ' . $this->url);
- } else {
- Minz_Log::warning('Feed has valid GUIDs again: ' . $this->url);
+ if ($invalidGuids > 0) {
+ Minz_Log::warning("Feed has {$invalidGuids} invalid GUIDs: " . $this->url);
+ if (!$this->attributeBoolean('unicityCriteriaForced') && $invalidGuids > round($invalidGuidsTolerance * count($items))) {
+ $unicityCriteria = $this->attributeString('unicityCriteria');
+ if ($this->attributeBoolean('hasBadGuids')) { // Legacy
+ $unicityCriteria = 'link';
+ }
+
+ // Automatic fallback to next (degraded) unicity criteria
+ $newUnicityCriteria = match ($unicityCriteria) {
+ null => 'sha1:link_published',
+ 'link' => 'sha1:link_published',
+ 'sha1:link_published' => 'sha1:link_published_title',
+ default => $unicityCriteria,
+ };
+
+ if ($newUnicityCriteria !== $unicityCriteria) {
+ $this->_attribute('hasBadGuids', null); // Remove legacy
+ $this->_attribute('unicityCriteria', $newUnicityCriteria);
+ Minz_Log::warning('Feed unicity policy degraded (' . ($unicityCriteria ?: 'id') . ' → ' . $newUnicityCriteria . '): ' . $this->url);
+ return $this->loadGuids($simplePie, $invalidGuidsTolerance);
+ }
}
- $feedDAO = FreshRSS_Factory::createFeedDao();
- $feedDAO->updateFeedAttribute($this, 'hasBadGuids', !$hasUniqueGuids);
+ $this->_error(true);
}
- return $hasUniqueGuids ? $guids : $links;
+ return $guids;
}
/** @return Traversable<FreshRSS_Entry> */
public function loadEntries(\SimplePie\SimplePie $simplePie): Traversable {
- $hasBadGuids = $this->attributeBoolean('hasBadGuids');
-
$items = $simplePie->get_items();
if (empty($items)) {
return;
@@ -487,7 +547,7 @@ class FreshRSS_Feed extends Minz_Model {
$title = html_only_entity_decode(strip_tags($item->get_title() ?? ''));
$authors = $item->get_authors();
$link = $item->get_permalink();
- $date = @strtotime((string)($item->get_date() ?? '')) ?: 0;
+ $date = $item->get_date('U');
//Tag processing (tag == category)
$categories = $item->get_categories();
@@ -571,7 +631,7 @@ class FreshRSS_Feed extends Minz_Model {
}
}
- $guid = safe_ascii($item->get_id(false, false));
+ $guid = $this->decideEntryGuid($item, fallback: true);
unset($item);
$authorNames = '';
@@ -587,7 +647,7 @@ class FreshRSS_Feed extends Minz_Model {
$entry = new FreshRSS_Entry(
$this->id(),
- $hasBadGuids ? '' : $guid,
+ $guid,
$title == '' ? '' : $title,
$authorNames,
$content == '' ? '' : $content,