From 1a552bd60eab4a4b940d3896376b599e155d7da0 Mon Sep 17 00:00:00 2001 From: Alexandre Alapetite Date: Fri, 6 Sep 2024 09:35:58 +0200 Subject: Regex search (#6706) * Regex search fix https://github.com/FreshRSS/FreshRSS/issues/3549 * Fix PHPStan * Fix escape * Fix ungreedy * Initial support for regex search in PostgreSQL and MySQL * Improvements, support MySQL * Fix multiline * Add support for SQLite * A few tests * Added author: and inurl: support, documentation * author example * Remove \b for now * Disable regex sanitization for now * Fix getInurlRegex * getNotInurlRegex * Quotes for inurl: * Fix test * Fix quoted tags + regex for tags https://github.com/FreshRSS/FreshRSS/issues/6761 * Fix wrong regex detection * Add MariaDB * Fix logic * Increase requirements for MySQL and MariaDB Check support for multiline mode in MySQL * Remove sanitizeRegexes() * Allow searching HTML code Allow searching for instance `/
/`
Fix https://github.com/FreshRSS/FreshRSS/issues/6775#issuecomment-2331769883

* Doc regex search HTML

* Fix Doctype
---
 app/Models/EntryDAO.php | 116 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 114 insertions(+), 2 deletions(-)

(limited to 'app/Models/EntryDAO.php')

diff --git a/app/Models/EntryDAO.php b/app/Models/EntryDAO.php
index 175df15c3..6a00e2108 100644
--- a/app/Models/EntryDAO.php
+++ b/app/Models/EntryDAO.php
@@ -27,6 +27,55 @@ class FreshRSS_EntryDAO extends Minz_ModelPdo {
 		return str_replace('INSERT INTO ', 'INSERT IGNORE INTO ', $sql);
 	}
 
+	/** @return array{pattern?:string,matchType?:string} */
+	protected static function regexToSql(string $regex): array {
+		if (preg_match('#^/(?P.*)/(?P[im]*)$#', $regex, $matches)) {
+			return $matches;
+		}
+		return [];
+	}
+
+	/** @param array $values */
+	protected static function sqlRegex(string $expression, string $regex, array &$values): string {
+		// The implementation of this function is solely for MySQL and MariaDB
+		static $databaseDAOMySQL = null;
+		if ($databaseDAOMySQL === null) {
+			$databaseDAOMySQL = new FreshRSS_DatabaseDAO();
+		}
+
+		$matches = static::regexToSql($regex);
+		if (isset($matches['pattern'])) {
+			$matchType = $matches['matchType'] ?? '';
+			if ($databaseDAOMySQL->isMariaDB()) {
+				if (str_contains($matchType, 'm')) {
+					// multiline mode
+					$matches['pattern'] = '(?m)' . $matches['pattern'];
+				}
+				if (str_contains($matchType, 'i')) {
+					// case-insensitive match
+					$matches['pattern'] = '(?i)' . $matches['pattern'];
+				} else {
+					$matches['pattern'] = '(?-i)' . $matches['pattern'];
+				}
+				$values[] = $matches['pattern'];
+				return "{$expression} REGEXP ?";
+			} else {	// MySQL
+				if (!str_contains($matchType, 'i')) {
+					// Case-sensitive matching
+					$matchType .= 'c';
+				}
+				$values[] = $matches['pattern'];
+				return "REGEXP_LIKE({$expression},?,'{$matchType}')";
+			}
+		}
+		return '';
+	}
+
+	/** Register any needed SQL function for the query, e.g. application-defined functions for SQLite */
+	protected function registerSqlFunctions(string $sql): void {
+		// Nothing to do for MySQL
+	}
+
 	private function updateToMediumBlob(): bool {
 		if ($this->pdo->dbType() !== 'mysql') {
 			return false;
@@ -910,24 +959,44 @@ SQL;
 					$values[] = "%{$author}%";
 				}
 			}
+			if ($filter->getAuthorRegex() !== null) {
+				foreach ($filter->getAuthorRegex() as $author) {
+					$sub_search .= 'AND ' . static::sqlRegex("REPLACE({$alias}author, ';', '\n')", $author, $values) . ' ';
+				}
+			}
 			if ($filter->getIntitle() !== null) {
 				foreach ($filter->getIntitle() as $title) {
 					$sub_search .= 'AND ' . $alias . 'title LIKE ? ';
 					$values[] = "%{$title}%";
 				}
 			}
+			if ($filter->getIntitleRegex() !== null) {
+				foreach ($filter->getIntitleRegex() as $title) {
+					$sub_search .= 'AND ' . static::sqlRegex($alias . 'title', $title, $values) . ' ';
+				}
+			}
 			if ($filter->getTags() !== null) {
 				foreach ($filter->getTags() as $tag) {
 					$sub_search .= 'AND ' . static::sqlConcat('TRIM(' . $alias . 'tags) ', " ' #'") . ' LIKE ? ';
 					$values[] = "%{$tag} #%";
 				}
 			}
+			if ($filter->getTagsRegex() !== null) {
+				foreach ($filter->getTagsRegex() as $tag) {
+					$sub_search .= 'AND ' . static::sqlRegex("REPLACE(REPLACE({$alias}tags, ' #', '#'), '#', '\n')", $tag, $values) . ' ';
+				}
+			}
 			if ($filter->getInurl() !== null) {
 				foreach ($filter->getInurl() as $url) {
 					$sub_search .= 'AND ' . $alias . 'link LIKE ? ';
 					$values[] = "%{$url}%";
 				}
 			}
+			if ($filter->getInurlRegex() !== null) {
+				foreach ($filter->getInurlRegex() as $url) {
+					$sub_search .= 'AND ' . static::sqlRegex($alias . 'link', $url, $values) . ' ';
+				}
+			}
 
 			if ($filter->getNotAuthor() !== null) {
 				foreach ($filter->getNotAuthor() as $author) {
@@ -935,29 +1004,49 @@ SQL;
 					$values[] = "%{$author}%";
 				}
 			}
+			if ($filter->getNotAuthorRegex() !== null) {
+				foreach ($filter->getNotAuthorRegex() as $author) {
+					$sub_search .= 'AND NOT ' . static::sqlRegex("REPLACE({$alias}author, ';', '\n')", $author, $values) . ' ';
+				}
+			}
 			if ($filter->getNotIntitle() !== null) {
 				foreach ($filter->getNotIntitle() as $title) {
 					$sub_search .= 'AND ' . $alias . 'title NOT LIKE ? ';
 					$values[] = "%{$title}%";
 				}
 			}
+			if ($filter->getNotIntitleRegex() !== null) {
+				foreach ($filter->getNotIntitleRegex() as $title) {
+					$sub_search .= 'AND NOT ' . static::sqlRegex($alias . 'title', $title, $values) . ' ';
+				}
+			}
 			if ($filter->getNotTags() !== null) {
 				foreach ($filter->getNotTags() as $tag) {
 					$sub_search .= 'AND ' . static::sqlConcat('TRIM(' . $alias . 'tags) ', " ' #'") . ' NOT LIKE ? ';
 					$values[] = "%{$tag} #%";
 				}
 			}
+			if ($filter->getNotTagsRegex() !== null) {
+				foreach ($filter->getNotTagsRegex() as $tag) {
+					$sub_search .= 'AND NOT ' . static::sqlRegex("REPLACE(REPLACE({$alias}tags, ' #', '#'), '#', '\n')", $tag, $values) . ' ';
+				}
+			}
 			if ($filter->getNotInurl() !== null) {
 				foreach ($filter->getNotInurl() as $url) {
 					$sub_search .= 'AND ' . $alias . 'link NOT LIKE ? ';
 					$values[] = "%{$url}%";
 				}
 			}
+			if ($filter->getNotInurlRegex() !== null) {
+				foreach ($filter->getNotInurlRegex() as $url) {
+					$sub_search .= 'AND NOT ' . static::sqlRegex($alias . 'link', $url, $values) . ' ';
+				}
+			}
 
 			if ($filter->getSearch() !== null) {
 				foreach ($filter->getSearch() as $search_value) {
 					if (static::isCompressed()) {	// MySQL-only
-						$sub_search .= 'AND CONCAT(' . $alias . 'title, UNCOMPRESS(' . $alias . 'content_bin)) LIKE ? ';
+						$sub_search .= "AND CONCAT({$alias}title, '\\n', UNCOMPRESS({$alias}content_bin)) LIKE ? ";
 						$values[] = "%{$search_value}%";
 					} else {
 						$sub_search .= 'AND (' . $alias . 'title LIKE ? OR ' . $alias . 'content LIKE ?) ';
@@ -966,10 +1055,21 @@ SQL;
 					}
 				}
 			}
+			if ($filter->getSearchRegex() !== null) {
+				foreach ($filter->getSearchRegex() as $search_value) {
+					if (static::isCompressed()) {	// MySQL-only
+						$sub_search .= 'AND (' . static::sqlRegex($alias . 'title', $search_value, $values) .
+							' OR ' . static::sqlRegex("UNCOMPRESS({$alias}content_bin)", $search_value, $values) . ') ';
+					} else {
+						$sub_search .= 'AND (' . static::sqlRegex($alias . 'title', $search_value, $values) .
+							' OR ' . static::sqlRegex($alias . 'content', $search_value, $values) . ') ';
+					}
+				}
+			}
 			if ($filter->getNotSearch() !== null) {
 				foreach ($filter->getNotSearch() as $search_value) {
 					if (static::isCompressed()) {	// MySQL-only
-						$sub_search .= 'AND CONCAT(' . $alias . 'title, UNCOMPRESS(' . $alias . 'content_bin)) NOT LIKE ? ';
+						$sub_search .= "AND CONCAT({$alias}title, '\\n', UNCOMPRESS({$alias}content_bin)) NOT LIKE ? ";
 						$values[] = "%{$search_value}%";
 					} else {
 						$sub_search .= 'AND ' . $alias . 'title NOT LIKE ? AND ' . $alias . 'content NOT LIKE ? ';
@@ -978,6 +1078,17 @@ SQL;
 					}
 				}
 			}
+			if ($filter->getNotSearchRegex() !== null) {
+				foreach ($filter->getNotSearchRegex() as $search_value) {
+					if (static::isCompressed()) {	// MySQL-only
+						$sub_search .= 'AND NOT ' . static::sqlRegex($alias . 'title', $search_value, $values) .
+							' ANT NOT ' . static::sqlRegex("UNCOMPRESS({$alias}content_bin)", $search_value, $values) . ' ';
+					} else {
+						$sub_search .= 'AND NOT ' . static::sqlRegex($alias . 'title', $search_value, $values) .
+							' AND NOT ' . static::sqlRegex($alias . 'content', $search_value, $values) . ' ';
+					}
+				}
+			}
 
 			if ($sub_search != '') {
 				if ($isOpen) {
@@ -1039,6 +1150,7 @@ SQL;
 			if ($filterSearch !== '') {
 				$search .= 'AND (' . $filterSearch . ') ';
 				$values = array_merge($values, $filterValues);
+				$this->registerSqlFunctions($search);
 			}
 		}
 		return [$values, $search];
-- 
cgit v1.2.3