From 1a552bd60eab4a4b940d3896376b599e155d7da0 Mon Sep 17 00:00:00 2001
From: Alexandre Alapetite
Date: Fri, 6 Sep 2024 09:35:58 +0200
Subject: Regex search (#6706)

* Regex search
fix https://github.com/FreshRSS/FreshRSS/issues/3549

* Fix PHPStan

* Fix escape

* Fix ungreedy

* Initial support for regex search in PostgreSQL and MySQL

* Improvements, support MySQL

* Fix multiline

* Add support for SQLite

* A few tests

* Added author: and inurl: support, documentation

* author example

* Remove \b for now

* Disable regex sanitization for now

* Fix getInurlRegex

* getNotInurlRegex

* Quotes for inurl:

* Fix test

* Fix quoted tags + regex for tags
https://github.com/FreshRSS/FreshRSS/issues/6761

* Fix wrong regex detection

* Add MariaDB

* Fix logic

* Increase requirements for MySQL and MariaDB
Check support for multiline mode in MySQL

* Remove sanitizeRegexes()

* Allow searching HTML code
Allow searching for instance `/<pre>/`
Fix https://github.com/FreshRSS/FreshRSS/issues/6775#issuecomment-2331769883

* Doc regex search HTML

* Fix Doctype
---
 app/Models/Search.php | 207 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 173 insertions(+), 34 deletions(-)

(limited to 'app/Models/Search.php')

diff --git a/app/Models/Search.php b/app/Models/Search.php
index 7eaf741c3..755cf6b59 100644
--- a/app/Models/Search.php
+++ b/app/Models/Search.php
@@ -27,6 +27,8 @@ class FreshRSS_Search {
 	private ?array $label_names = null;
 	/** @var array|null */
 	private ?array $intitle = null;
+	/** @var array|null */
+	private ?array $intitle_regex = null;
 	/** @var int|false|null */
 	private $min_date = null;
 	/** @var int|false|null */
@@ -38,11 +40,19 @@ class FreshRSS_Search {
 	/** @var array|null */
 	private ?array $inurl = null;
 	/** @var array|null */
+	private ?array $inurl_regex = null;
+	/** @var array|null */
 	private ?array $author = null;
 	/** @var array|null */
+	private ?array $author_regex = null;
+	/** @var array|null */
 	private ?array $tags = null;
 	/** @var array|null */
+	private ?array $tags_regex = null;
+	/** @var array|null */
 	private ?array $search = null;
+	/** @var array|null */
+	private ?array $search_regex = null;
 
 	/** @var array|null */
 	private ?array $not_entry_ids = null;
@@ -54,6 +64,8 @@ class FreshRSS_Search {
 	private ?array $not_label_names = null;
 	/** @var array|null */
 	private ?array $not_intitle = null;
+	/** @var array|null */
+	private ?array $not_intitle_regex = null;
 	/** @var int|false|null */
 	private $not_min_date = null;
 	/** @var int|false|null */
@@ -65,11 +77,19 @@ class FreshRSS_Search {
 	/** @var array|null */
 	private ?array $not_inurl = null;
 	/** @var array|null */
+	private ?array $not_inurl_regex = null;
+	/** @var array|null */
 	private ?array $not_author = null;
 	/** @var array|null */
+	private ?array $not_author_regex = null;
+	/** @var array|null */
 	private ?array $not_tags = null;
 	/** @var array|null */
+	private ?array $not_tags_regex = null;
+	/** @var array|null */
 	private ?array $not_search = null;
+	/** @var array|null */
+	private ?array $not_search_regex = null;
 
 	public function __construct(string $input) {
 		$input = self::cleanSearch($input);
@@ -156,9 +176,17 @@ class FreshRSS_Search {
 		return $this->intitle;
 	}
 	/** @return array|null */
+	public function getIntitleRegex(): ?array {
+		return $this->intitle_regex;
+	}
+	/** @return array|null */
 	public function getNotIntitle(): ?array {
 		return $this->not_intitle;
 	}
+	/** @return array|null */
+	public function getNotIntitleRegex(): ?array {
+		return $this->not_intitle_regex;
+	}
 
 	public function getMinDate(): ?int {
 		return $this->min_date ?: null;
@@ -199,36 +227,68 @@ class FreshRSS_Search {
 		return $this->inurl;
 	}
 	/** @return array|null */
+	public function getInurlRegex(): ?array {
+		return $this->inurl_regex;
+	}
+	/** @return array|null */
 	public function getNotInurl(): ?array {
 		return $this->not_inurl;
 	}
+	/** @return array|null */
+	public function getNotInurlRegex(): ?array {
+		return $this->not_inurl_regex;
+	}
 
 	/** @return array|null */
 	public function getAuthor(): ?array {
 		return $this->author;
 	}
 	/** @return array|null */
+	public function getAuthorRegex(): ?array {
+		return $this->author_regex;
+	}
+	/** @return array|null */
 	public function getNotAuthor(): ?array {
 		return $this->not_author;
 	}
+	/** @return array|null */
+	public function getNotAuthorRegex(): ?array {
+		return $this->not_author_regex;
+	}
 
 	/** @return array|null */
 	public function getTags(): ?array {
 		return $this->tags;
 	}
 	/** @return array|null */
+	public function getTagsRegex(): ?array {
+		return $this->tags_regex;
+	}
+	/** @return array|null */
 	public function getNotTags(): ?array {
 		return $this->not_tags;
 	}
+	/** @return array|null */
+	public function getNotTagsRegex(): ?array {
+		return $this->not_tags_regex;
+	}
 
 	/** @return array|null */
 	public function getSearch(): ?array {
 		return $this->search;
 	}
 	/** @return array|null */
+	public function getSearchRegex(): ?array {
+		return $this->search_regex;
+	}
+	/** @return array|null */
 	public function getNotSearch(): ?array {
 		return $this->not_search;
 	}
+	/** @return array|null */
+	public function getNotSearchRegex(): ?array {
+		return $this->not_search_regex;
+	}
 
 	/**
 	 * @param array|null $anArray
@@ -253,11 +313,19 @@ class FreshRSS_Search {
 		return $value;
 	}
 
+	/**
+	 * @param array $strings
+	 * @return array
+	 */
+	private static function htmlspecialchars_decodes(array $strings): array {
+		return array_map(static fn(string $s) => htmlspecialchars_decode($s, ENT_QUOTES), $strings);
+	}
+
 	/**
 	 * Parse the search string to find entry (article) IDs.
 	 */
 	private function parseEntryIds(string $input): string {
-		if (preg_match_all('/\be:(?P<search>[0-9,]*)/', $input, $matches)) {
+		if (preg_match_all('/\\be:(?P<search>[0-9,]*)/', $input, $matches)) {
 			$input = str_replace($matches[0], '', $input);
 			$ids_lists = $matches['search'];
 			$this->entry_ids = [];
@@ -273,7 +341,7 @@ class FreshRSS_Search {
 	}
 
 	private function parseNotEntryIds(string $input): string {
-		if (preg_match_all('/(?<=\s|^)[!-]e:(?P<search>[0-9,]*)/', $input, $matches)) {
+		if (preg_match_all('/(?<=\\s|^)[!-]e:(?P<search>[0-9,]*)/', $input, $matches)) {
 			$input = str_replace($matches[0], '', $input);
 			$ids_lists = $matches['search'];
 			$this->not_entry_ids = [];
@@ -289,7 +357,7 @@ class FreshRSS_Search {
 	}
 
 	private function parseFeedIds(string $input): string {
-		if (preg_match_all('/\bf:(?P<search>[0-9,]*)/', $input, $matches)) {
+		if (preg_match_all('/\\bf:(?P<search>[0-9,]*)/', $input, $matches)) {
 			$input = str_replace($matches[0], '', $input);
 			$ids_lists = $matches['search'];
 			$this->feed_ids = [];
@@ -307,7 +375,7 @@ class FreshRSS_Search {
 	}
 
 	private function parseNotFeedIds(string $input): string {
-		if (preg_match_all('/(?<=\s|^)[!-]f:(?P<search>[0-9,]*)/', $input, $matches)) {
+		if (preg_match_all('/(?<=\\s|^)[!-]f:(?P<search>[0-9,]*)/', $input, $matches)) {
 			$input = str_replace($matches[0], '', $input);
 			$ids_lists = $matches['search'];
 			$this->not_feed_ids = [];
@@ -328,7 +396,7 @@ class FreshRSS_Search {
 	 * Parse the search string to find tags (labels) IDs.
 	 */
 	private function parseLabelIds(string $input): string {
-		if (preg_match_all('/\b[lL]:(?P<search>[0-9,]+|[*])/', $input, $matches)) {
+		if (preg_match_all('/\\b[lL]:(?P<search>[0-9,]+|[*])/', $input, $matches)) {
 			$input = str_replace($matches[0], '', $input);
 			$ids_lists = $matches['search'];
 			$this->label_ids = [];
@@ -350,7 +418,7 @@ class FreshRSS_Search {
 	}
 
 	private function parseNotLabelIds(string $input): string {
-		if (preg_match_all('/(?<=\s|^)[!-][lL]:(?P<search>[0-9,]+|[*])/', $input, $matches)) {
+		if (preg_match_all('/(?<=\\s|^)[!-][lL]:(?P<search>[0-9,]+|[*])/', $input, $matches)) {
 			$input = str_replace($matches[0], '', $input);
 			$ids_lists = $matches['search'];
 			$this->not_label_ids = [];
@@ -376,11 +444,11 @@ class FreshRSS_Search {
 	 */
 	private function parseLabelNames(string $input): string {
 		$names_lists = [];
-		if (preg_match_all('/\blabels?:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
+		if (preg_match_all('/\\blabels?:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
 			$names_lists = $matches['search'];
 			$input = str_replace($matches[0], '', $input);
 		}
-		if (preg_match_all('/\blabels?:(?P<search>[^\s"]*)/', $input, $matches)) {
+		if (preg_match_all('/\\blabels?:(?P<search>[^\s"]*)/', $input, $matches)) {
 			$names_lists = array_merge($names_lists, $matches['search']);
 			$input = str_replace($matches[0], '', $input);
 		}
@@ -402,11 +470,11 @@ class FreshRSS_Search {
 	 */
 	private function parseNotLabelNames(string $input): string {
 		$names_lists = [];
-		if (preg_match_all('/(?<=\s|^)[!-]labels?:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
+		if (preg_match_all('/(?<=\\s|^)[!-]labels?:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
 			$names_lists = $matches['search'];
 			$input = str_replace($matches[0], '', $input);
 		}
-		if (preg_match_all('/(?<=\s|^)[!-]labels?:(?P<search>[^\s"]*)/', $input, $matches)) {
+		if (preg_match_all('/(?<=\\s|^)[!-]labels?:(?P<search>[^\\s"]*)/', $input, $matches)) {
 			$names_lists = array_merge($names_lists, $matches['search']);
 			$input = str_replace($matches[0], '', $input);
 		}
@@ -428,11 +496,15 @@ class FreshRSS_Search {
 	 * The search is the first word following the keyword.
 	 */
 	private function parseIntitleSearch(string $input): string {
-		if (preg_match_all('/\bintitle:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
+		if (preg_match_all('#\\bintitle:(?P<search>/.*?(?<!\\\\)/[im]*)#', $input, $matches)) {
+			$this->intitle_regex = self::htmlspecialchars_decodes($matches['search']);
+			$input = str_replace($matches[0], '', $input);
+		}
+		if (preg_match_all('/\\bintitle:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
 			$this->intitle = $matches['search'];
 			$input = str_replace($matches[0], '', $input);
 		}
-		if (preg_match_all('/\bintitle:(?P<search>[^\s"]*)/', $input, $matches)) {
+		if (preg_match_all('/\\bintitle:(?P<search>[^\s"]*)/', $input, $matches)) {
 			$this->intitle = array_merge($this->intitle ?: [], $matches['search']);
 			$input = str_replace($matches[0], '', $input);
 		}
@@ -444,11 +516,15 @@ class FreshRSS_Search {
 	}
 
 	private function parseNotIntitleSearch(string $input): string {
-		if (preg_match_all('/(?<=\s|^)[!-]intitle:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
+		if (preg_match_all('#(?<=\\s|^)[!-]intitle:(?P<search>/.*?(?<!\\\\)/[im]*)#', $input, $matches)) {
+			$this->not_intitle_regex = self::htmlspecialchars_decodes($matches['search']);
+			$input = str_replace($matches[0], '', $input);
+		}
+		if (preg_match_all('/(?<=\\s|^)[!-]intitle:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
 			$this->not_intitle = $matches['search'];
 			$input = str_replace($matches[0], '', $input);
 		}
-		if (preg_match_all('/(?<=\s|^)[!-]intitle:(?P<search>[^\s"]*)/', $input, $matches)) {
+		if (preg_match_all('/(?<=\\s|^)[!-]intitle:(?P<search>[^\s"]*)/', $input, $matches)) {
 			$this->not_intitle = array_merge($this->not_intitle ?: [], $matches['search']);
 			$input = str_replace($matches[0], '', $input);
 		}
@@ -465,11 +541,15 @@ class FreshRSS_Search {
 	 * a delimiter. Supported delimiters are single quote (') and double quotes (").
 	 */
 	private function parseAuthorSearch(string $input): string {
-		if (preg_match_all('/\bauthor:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
+		if (preg_match_all('#\\bauthor:(?P<search>/.*?(?<!\\\\)/[im]*)#', $input, $matches)) {
+			$this->author_regex = self::htmlspecialchars_decodes($matches['search']);
+			$input = str_replace($matches[0], '', $input);
+		}
+		if (preg_match_all('/\\bauthor:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
 			$this->author = $matches['search'];
 			$input = str_replace($matches[0], '', $input);
 		}
-		if (preg_match_all('/\bauthor:(?P<search>[^\s"]*)/', $input, $matches)) {
+		if (preg_match_all('/\\bauthor:(?P<search>[^\s"]*)/', $input, $matches)) {
 			$this->author = array_merge($this->author ?: [], $matches['search']);
 			$input = str_replace($matches[0], '', $input);
 		}
@@ -481,11 +561,15 @@ class FreshRSS_Search {
 	}
 
 	private function parseNotAuthorSearch(string $input): string {
-		if (preg_match_all('/(?<=\s|^)[!-]author:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
+		if (preg_match_all('#(?<=\\s|^)[!-]author:(?P<search>/.*?(?<!\\\\)/[im]*)#', $input, $matches)) {
+			$this->not_author_regex = self::htmlspecialchars_decodes($matches['search']);
+			$input = str_replace($matches[0], '', $input);
+		}
+		if (preg_match_all('/(?<=\\s|^)[!-]author:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
 			$this->not_author = $matches['search'];
 			$input = str_replace($matches[0], '', $input);
 		}
-		if (preg_match_all('/(?<=\s|^)[!-]author:(?P<search>[^\s"]*)/', $input, $matches)) {
+		if (preg_match_all('/(?<=\\s|^)[!-]author:(?P<search>[^\s"]*)/', $input, $matches)) {
 			$this->not_author = array_merge($this->not_author ?: [], $matches['search']);
 			$input = str_replace($matches[0], '', $input);
 		}
@@ -501,19 +585,41 @@ class FreshRSS_Search {
 	 * The search is the first word following the keyword.
 	 */
 	private function parseInurlSearch(string $input): string {
-		if (preg_match_all('/\binurl:(?P<search>[^\s]*)/', $input, $matches)) {
+		if (preg_match_all('#\\binurl:(?P<search>/.*?(?<!\\\\)/[im]*)#', $input, $matches)) {
+			$this->inurl_regex = self::htmlspecialchars_decodes($matches['search']);
+			$input = str_replace($matches[0], '', $input);
+		}
+		if (preg_match_all('/\\binurl:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
 			$this->inurl = $matches['search'];
 			$input = str_replace($matches[0], '', $input);
-			$this->inurl = self::removeEmptyValues($this->inurl);
+		}
+		if (preg_match_all('/\\binurl:(?P<search>[^\\s]*)/', $input, $matches)) {
+			$this->inurl = $matches['search'];
+			$input = str_replace($matches[0], '', $input);
+		}
+		$this->inurl = self::removeEmptyValues($this->inurl);
+		if (empty($this->inurl)) {
+			$this->inurl = null;
 		}
 		return $input;
 	}
 
 	private function parseNotInurlSearch(string $input): string {
-		if (preg_match_all('/(?<=\s|^)[!-]inurl:(?P<search>[^\s]*)/', $input, $matches)) {
+		if (preg_match_all('#(?<=\\s|^)[!-]inurl:(?P<search>/.*?(?<!\\\\)/[im]*)#', $input, $matches)) {
+			$this->not_inurl_regex = self::htmlspecialchars_decodes($matches['search']);
+			$input = str_replace($matches[0], '', $input);
+		}
+		if (preg_match_all('/(?<=\\s|^)[!-]inurl:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
 			$this->not_inurl = $matches['search'];
 			$input = str_replace($matches[0], '', $input);
-			$this->not_inurl = self::removeEmptyValues($this->not_inurl);
+		}
+		if (preg_match_all('/(?<=\\s|^)[!-]inurl:(?P<search>[^\\s]*)/', $input, $matches)) {
+			$this->not_inurl = $matches['search'];
+			$input = str_replace($matches[0], '', $input);
+		}
+		$this->not_inurl = self::removeEmptyValues($this->not_inurl);
+		if (empty($this->not_inurl)) {
+			$this->not_inurl = null;
 		}
 		return $input;
 	}
@@ -523,7 +629,7 @@ class FreshRSS_Search {
 	 * The search is the first word following the keyword.
 	 */
 	private function parseDateSearch(string $input): string {
-		if (preg_match_all('/\bdate:(?P<search>[^\s]*)/', $input, $matches)) {
+		if (preg_match_all('/\\bdate:(?P<search>[^\\s]*)/', $input, $matches)) {
 			$input = str_replace($matches[0], '', $input);
 			$dates = self::removeEmptyValues($matches['search']);
 			if (!empty($dates[0])) {
@@ -534,7 +640,7 @@ class FreshRSS_Search {
 	}
 
 	private function parseNotDateSearch(string $input): string {
-		if (preg_match_all('/(?<=\s|^)[!-]date:(?P<search>[^\s]*)/', $input, $matches)) {
+		if (preg_match_all('/(?<=\\s|^)[!-]date:(?P<search>[^\\s]*)/', $input, $matches)) {
 			$input = str_replace($matches[0], '', $input);
 			$dates = self::removeEmptyValues($matches['search']);
 			if (!empty($dates[0])) {
@@ -550,7 +656,7 @@ class FreshRSS_Search {
 	 * The search is the first word following the keyword.
 	 */
 	private function parsePubdateSearch(string $input): string {
-		if (preg_match_all('/\bpubdate:(?P<search>[^\s]*)/', $input, $matches)) {
+		if (preg_match_all('/\\bpubdate:(?P<search>[^\\s]*)/', $input, $matches)) {
 			$input = str_replace($matches[0], '', $input);
 			$dates = self::removeEmptyValues($matches['search']);
 			if (!empty($dates[0])) {
@@ -561,7 +667,7 @@ class FreshRSS_Search {
 	}
 
 	private function parseNotPubdateSearch(string $input): string {
-		if (preg_match_all('/(?<=\s|^)[!-]pubdate:(?P<search>[^\s]*)/', $input, $matches)) {
+		if (preg_match_all('/(?<=\\s|^)[!-]pubdate:(?P<search>[^\\s]*)/', $input, $matches)) {
 			$input = str_replace($matches[0], '', $input);
 			$dates = self::removeEmptyValues($matches['search']);
 			if (!empty($dates[0])) {
@@ -577,20 +683,44 @@ class FreshRSS_Search {
 	 * The search is the first word following the #.
 	 */
 	private function parseTagsSearch(string $input): string {
-		if (preg_match_all('/#(?P<search>[^\s]+)/', $input, $matches)) {
+		if (preg_match_all('%#(?P<search>/.*?(?<!\\\\)/[im]*)%', $input, $matches)) {
+			$this->tags_regex = self::htmlspecialchars_decodes($matches['search']);
+			$input = str_replace($matches[0], '', $input);
+		}
+		if (preg_match_all('/#(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
 			$this->tags = $matches['search'];
 			$input = str_replace($matches[0], '', $input);
-			$this->tags = self::removeEmptyValues($this->tags);
+		}
+		if (preg_match_all('/#(?P<search>[^\\s]+)/', $input, $matches)) {
+			$this->tags = $matches['search'];
+			$input = str_replace($matches[0], '', $input);
+		}
+		$this->tags = self::removeEmptyValues($this->tags);
+		if (empty($this->tags)) {
+			$this->tags = null;
+		} else {
 			$this->tags = self::decodeSpaces($this->tags);
 		}
 		return $input;
 	}
 
 	private function parseNotTagsSearch(string $input): string {
-		if (preg_match_all('/(?<=\s|^)[!-]#(?P<search>[^\s]+)/', $input, $matches)) {
+		if (preg_match_all('%(?<=\\s|^)[!-]#(?P<search>/.*?(?<!\\\\)/[im]*)%', $input, $matches)) {
+			$this->not_tags_regex = self::htmlspecialchars_decodes($matches['search']);
+			$input = str_replace($matches[0], '', $input);
+		}
+		if (preg_match_all('/(?<=\\s|^)[!-]#(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
+			$this->not_tags = $matches['search'];
+			$input = str_replace($matches[0], '', $input);
+		}
+		if (preg_match_all('/(?<=\\s|^)[!-]#(?P<search>[^\\s]+)/', $input, $matches)) {
 			$this->not_tags = $matches['search'];
 			$input = str_replace($matches[0], '', $input);
-			$this->not_tags = self::removeEmptyValues($this->not_tags);
+		}
+		$this->not_tags = self::removeEmptyValues($this->not_tags);
+		if (empty($this->not_tags)) {
+			$this->not_tags = null;
+		} else {
 			$this->not_tags = self::decodeSpaces($this->not_tags);
 		}
 		return $input;
@@ -599,13 +729,18 @@ class FreshRSS_Search {
 	/**
 	 * Parse the search string to find search values.
 	 * Every word is a distinct search value using a delimiter.
-	 * Supported delimiters are single quote (') and double quotes (").
+	 * Supported delimiters are single quote (') and double quotes (") and regex (/).
 	 */
 	private function parseQuotedSearch(string $input): string {
 		$input = self::cleanSearch($input);
 		if ($input === '') {
 			return '';
 		}
+		if (preg_match_all('#(?<=\\s|^)(?<![!-])(?P<search>/.*?(?<!\\\\)/[im]*)#', $input, $matches)) {
+			$this->search_regex = self::htmlspecialchars_decodes($matches['search']);
+			//TODO: Replace all those str_replace with PREG_OFFSET_CAPTURE
+			$input = str_replace($matches[0], '', $input);
+		}
 		if (preg_match_all('/(?<![!-])(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
 			$this->search = $matches['search'];
 			//TODO: Replace all those str_replace with PREG_OFFSET_CAPTURE
@@ -636,7 +771,11 @@ class FreshRSS_Search {
 		if ($input === '') {
 			return '';
 		}
-		if (preg_match_all('/(?<=\s|^)[!-](?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
+		if (preg_match_all('#(?<=\\s|^)[!-](?P<search>(?<!\\\\)/.*?(?<!\\\\)/[im]*)#', $input, $matches)) {
+			$this->not_search_regex = self::htmlspecialchars_decodes($matches['search']);
+			$input = str_replace($matches[0], '', $input);
+		}
+		if (preg_match_all('/(?<=\\s|^)[!-](?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matches)) {
 			$this->not_search = $matches['search'];
 			$input = str_replace($matches[0], '', $input);
 		}
@@ -644,7 +783,7 @@ class FreshRSS_Search {
 		if ($input === '') {
 			return '';
 		}
-		if (preg_match_all('/(?<=\s|^)[!-](?P<search>[^\s]+)/', $input, $matches)) {
+		if (preg_match_all('/(?<=\\s|^)[!-](?P<search>[^\\s]+)/', $input, $matches)) {
 			$this->not_search = array_merge(is_array($this->not_search) ? $this->not_search : [], $matches['search']);
 			$input = str_replace($matches[0], '', $input);
 		}
@@ -656,7 +795,7 @@ class FreshRSS_Search {
 	 * Remove all unnecessary spaces in the search
 	 */
 	private static function cleanSearch(string $input): string {
-		$input = preg_replace('/\s+/', ' ', $input);
+		$input = preg_replace('/\\s+/', ' ', $input);
 		if (!is_string($input)) {
 			return '';
 		}
-- 
cgit v1.2.3