diff options
| author | 2024-08-01 19:44:58 +0200 | |
|---|---|---|
| committer | 2024-08-01 19:44:58 +0200 | |
| commit | 58de38bd737d2dba617944b54a98d4bdb5810588 (patch) | |
| tree | ff197baaabd4b3e12b29eb74f3d818d8f4a04e9a | |
| parent | 666e7b27ce4f9c8254e76373572f220ddfb2fd37 (diff) | |
Fix parentheses for complex `OR` boolean search expressions (#6672)
* Fix OR parentheses
* Pass all tests
* Forgotten comment
* Minor whitespace
* Fix several cases envolving negation
* Line length
* Fix `OR NOT`
| -rw-r--r-- | app/Models/BooleanSearch.php | 129 | ||||
| -rw-r--r-- | app/Models/Entry.php | 20 | ||||
| -rw-r--r-- | app/Models/EntryDAO.php | 2 | ||||
| -rw-r--r-- | tests/app/Models/SearchTest.php | 114 |
4 files changed, 245 insertions, 20 deletions
diff --git a/app/Models/BooleanSearch.php b/app/Models/BooleanSearch.php index 692738107..88069c461 100644 --- a/app/Models/BooleanSearch.php +++ b/app/Models/BooleanSearch.php @@ -11,11 +11,11 @@ class FreshRSS_BooleanSearch { private array $searches = []; /** - * @phpstan-var 'AND'|'OR'|'AND NOT' + * @phpstan-var 'AND'|'OR'|'AND NOT'|'OR NOT' */ private string $operator; - /** @param 'AND'|'OR'|'AND NOT' $operator */ + /** @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator */ public function __construct(string $input, int $level = 0, string $operator = 'AND', bool $allowUserQueries = true) { $this->operator = $operator; $input = trim($input); @@ -38,6 +38,8 @@ class FreshRSS_BooleanSearch { } $this->raw_input = $input; + $input = self::consistentOrParentheses($input); + // Either parse everything as a series of BooleanSearch’s combined by implicit AND // or parse everything as a series of Search’s combined by explicit OR $this->parseParentheses($input, $level) || $this->parseOrSegments($input); @@ -130,6 +132,107 @@ class FreshRSS_BooleanSearch { return $input; } + /** + * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")' + */ + public static function addOrParentheses(string $input): string { + $input = trim($input); + if ($input === '') { + return ''; + } + $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: []; + $ns = count($splits); + if ($ns <= 1) { + return $input; + } + $result = ''; + $segment = ''; + for ($i = 0; $i < $ns; $i++) { + $segment .= $splits[$i]; + if (trim($segment) === '') { + $segment = ''; + } elseif (strcasecmp($segment, 'OR') === 0) { + $result .= $segment . ' '; + $segment = ''; + } else { + $quotes = substr_count($segment, '"') + substr_count($segment, '"'); + if ($quotes % 2 === 0) { + $segment = trim($segment); + if (in_array($segment, ['!', '-'], true)) { + $result .= $segment; + } else { + $result .= '(' . $segment . ') '; + } + $segment = ''; + } + } + } + $segment = trim($segment); + if (in_array($segment, ['!', '-'], true)) { + $result .= $segment; + } elseif ($segment !== '') { + $result .= '(' . $segment . ')'; + } + return trim($result); + } + + /** + * If the query contains a mix of `OR` expressions with and without parentheses, + * then add parentheses to make the query consistent. + * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)' + */ + public static function consistentOrParentheses(string $input): string { + if (!preg_match('/(?<!\\\\)\\(/', $input)) { + // No unescaped parentheses in the input + return trim($input); + } + $parenthesesCount = 0; + $result = ''; + $segment = ''; + $length = strlen($input); + + for ($i = 0; $i < $length; $i++) { + $c = $input[$i]; + $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false; + if (!$backslashed) { + if ($c === '(') { + if ($parenthesesCount === 0) { + if ($segment !== '') { + $result = rtrim($result) . ' ' . self::addOrParentheses($segment); + $negation = preg_match('/[!-]$/', $result); + if (!$negation) { + $result .= ' '; + } + $segment = ''; + } + $c = ''; + } + $parenthesesCount++; + } elseif ($c === ')') { + $parenthesesCount--; + if ($parenthesesCount === 0) { + $segment = self::consistentOrParentheses($segment); + if ($segment !== '') { + $result .= '(' . $segment . ')'; + $segment = ''; + } + $c = ''; + } + } + } + $segment .= $c; + } + if (trim($segment) !== '') { + $result = rtrim($result); + $negation = preg_match('/[!-]$/', $segment); + if (!$negation) { + $result .= ' '; + } + $result .= self::addOrParentheses($segment); + } + return trim($result); + } + /** @return bool True if some parenthesis logic took over, false otherwise */ private function parseParentheses(string $input, int $level): bool { $input = trim($input); @@ -146,9 +249,14 @@ class FreshRSS_BooleanSearch { $hasParenthesis = true; $before = trim($before); - if (preg_match('/[!-]$/i', $before)) { + if (preg_match('/[!-]$/', $before)) { // Trim trailing negation - $before = substr($before, 0, -1); + $before = rtrim($before, ' !-'); + $isOr = preg_match('/\bOR$/i', $before); + if ($isOr) { + // Trim trailing OR + $before = substr($before, 0, -2); + } // The text prior to the negation is a BooleanSearch $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator); @@ -157,8 +265,8 @@ class FreshRSS_BooleanSearch { } $before = ''; - // The next BooleanSearch will have to be combined with AND NOT instead of default AND - $nextOperator = 'AND NOT'; + // The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND + $nextOperator = $isOr ? 'OR NOT' : 'AND NOT'; } elseif (preg_match('/\bOR$/i', $before)) { // Trim trailing OR $before = substr($before, 0, -2); @@ -212,7 +320,7 @@ class FreshRSS_BooleanSearch { $i++; } // $sub = trim($sub); - // if ($sub != '') { + // if ($sub !== '') { // // TODO: Consider throwing an error or warning in case of non-matching parenthesis // } // } elseif ($c === ')') { @@ -249,12 +357,11 @@ class FreshRSS_BooleanSearch { return; } $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: []; - $segment = ''; $ns = count($splits); for ($i = 0; $i < $ns; $i++) { $segment = $segment . $splits[$i]; - if (trim($segment) == '' || strcasecmp($segment, 'OR') === 0) { + if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) { $segment = ''; } else { $quotes = substr_count($segment, '"') + substr_count($segment, '"'); @@ -266,7 +373,7 @@ class FreshRSS_BooleanSearch { } } $segment = trim($segment); - if ($segment != '') { + if ($segment !== '') { $this->searches[] = new FreshRSS_Search($segment); } } @@ -280,7 +387,7 @@ class FreshRSS_BooleanSearch { return $this->searches; } - /** @return 'AND'|'OR'|'AND NOT' depending on how this BooleanSearch should be combined */ + /** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */ public function operator(): string { return $this->operator; } diff --git a/app/Models/Entry.php b/app/Models/Entry.php index e4d728ba4..ada6e9944 100644 --- a/app/Models/Entry.php +++ b/app/Models/Entry.php @@ -577,12 +577,20 @@ HTML; foreach ($booleanSearch->searches() as $filter) { if ($filter instanceof FreshRSS_BooleanSearch) { // BooleanSearches are combined by AND (default) or OR or AND NOT (special cases) operators and are recursive - if ($filter->operator() === 'OR') { - $ok |= $this->matches($filter); - } elseif ($filter->operator() === 'AND NOT') { - $ok &= !$this->matches($filter); - } else { // AND - $ok &= $this->matches($filter); + switch ($filter->operator()) { + case 'OR': + $ok |= $this->matches($filter); + break; + case 'OR NOT': + $ok |= !$this->matches($filter); + break; + case 'AND NOT': + $ok &= !$this->matches($filter); + break; + case 'AND': + default: + $ok &= $this->matches($filter); + break; } } elseif ($filter instanceof FreshRSS_Search) { // Searches are combined by OR and are not recursive diff --git a/app/Models/EntryDAO.php b/app/Models/EntryDAO.php index ec627dda1..ba0cf1970 100644 --- a/app/Models/EntryDAO.php +++ b/app/Models/EntryDAO.php @@ -762,7 +762,7 @@ SQL; if ($filterSearch !== '') { if ($search !== '') { $search .= $filter->operator(); - } elseif ($filter->operator() === 'AND NOT') { + } elseif (in_array($filter->operator(), ['AND NOT', 'OR NOT'], true)) { // Special case if we start with a negation (there is already the default AND before) $search .= ' NOT'; } diff --git a/tests/app/Models/SearchTest.php b/tests/app/Models/SearchTest.php index e4457fedc..e01830314 100644 --- a/tests/app/Models/SearchTest.php +++ b/tests/app/Models/SearchTest.php @@ -285,12 +285,59 @@ class SearchTest extends PHPUnit\Framework\TestCase { } /** + * @dataProvider provideAddOrParentheses + */ + public function test__addOrParentheses(string $input, string $output): void { + self::assertEquals($output, FreshRSS_BooleanSearch::addOrParentheses($input)); + } + + /** @return array<array{string,string}> */ + public function provideAddOrParentheses(): array { + return [ + ['ab', 'ab'], + ['ab cd', 'ab cd'], + ['!ab -cd', '!ab -cd'], + ['ab OR cd', '(ab) OR (cd)'], + ['!ab OR -cd', '(!ab) OR (-cd)'], + ['ab cd OR ef OR "gh ij"', '(ab cd) OR (ef) OR ("gh ij")'], + ['ab (!cd)', 'ab (!cd)'], + ]; + } + + /** + * @dataProvider provideconsistentOrParentheses + */ + public function test__consistentOrParentheses(string $input, string $output): void { + self::assertEquals($output, FreshRSS_BooleanSearch::consistentOrParentheses($input)); + } + + /** @return array<array{string,string}> */ + public function provideconsistentOrParentheses(): array { + return [ + ['ab cd ef', 'ab cd ef'], + ['(ab cd ef)', '(ab cd ef)'], + ['("ab cd" ef)', '("ab cd" ef)'], + ['"ab cd" (ef gh) "ij kl"', '"ab cd" (ef gh) "ij kl"'], + ['ab (!cd)', 'ab (!cd)'], + ['ab !(cd)', 'ab !(cd)'], + ['(ab) -(cd)', '(ab) -(cd)'], + ['ab cd OR ef OR "gh ij"', 'ab cd OR ef OR "gh ij"'], + ['"plain or text" OR (cd)', '("plain or text") OR (cd)'], + ['(ab) OR cd OR ef OR (gh)', '(ab) OR (cd) OR (ef) OR (gh)'], + ['(ab (cd OR ef)) OR gh OR ij OR (kl)', '(ab (cd OR ef)) OR (gh) OR (ij) OR (kl)'], + ['(ab (cd OR ef OR (gh))) OR ij', '(ab ((cd) OR (ef) OR (gh))) OR (ij)'], + ['(ab (!cd OR ef OR (gh))) OR ij', '(ab ((!cd) OR (ef) OR (gh))) OR (ij)'], + ['(ab !(cd OR ef OR !(gh))) OR ij', '(ab !((cd) OR (ef) OR !(gh))) OR (ij)'], + ]; + } + + /** * @dataProvider provideParentheses * @param array<string> $values */ - public function test__construct_parentheses(string $input, string $sql, array $values): void { + public function test__parentheses(string $input, string $sql, array $values): void { [$filterValues, $filterSearch] = FreshRSS_EntryDAOPGSQL::sqlBooleanSearch('e.', new FreshRSS_BooleanSearch($input)); - self::assertEquals($sql, $filterSearch); + self::assertEquals(trim($sql), trim($filterSearch)); self::assertEquals($values, $filterValues); } @@ -337,6 +384,69 @@ class SearchTest extends PHPUnit\Framework\TestCase { '(e.title LIKE ? )', ['%"hello world"%'], ], + [ + '(ab) OR (cd) OR (ef)', + '(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))', + ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'], + ], + [ + '("plain or text") OR (cd)', + '(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))', + ['%plain or text%', '%plain or text%', '%cd%', '%cd%'], + ], + [ + '"plain or text" OR cd', + '((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )', + ['%plain or text%', '%plain or text%', '%cd%', '%cd%'], + ], + [ + '"plain OR text" OR cd', + '((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ) ', + ['%plain OR text%', '%plain OR text%', '%cd%', '%cd%'], + ], + [ + 'ab OR cd OR (ef)', + '(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ', + ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'], + ], + [ + 'ab OR cd OR ef', + '((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )', + ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'], + ], + [ + '(ab) cd OR ef OR (gh)', + '(((e.title LIKE ? OR e.content LIKE ?) )) AND (((e.title LIKE ? OR e.content LIKE ?) )) ' . + 'OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))', + ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%'], + ], + [ + '(ab) OR cd OR ef OR (gh)', + '(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ' . + 'OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))', + ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%'], + ], + [ + 'ab OR (!(cd OR ef))', + '(((e.title LIKE ? OR e.content LIKE ?) )) OR (NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )))', + ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'], + ], + [ + 'ab !(cd OR ef)', + '(((e.title LIKE ? OR e.content LIKE ?) )) AND NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))', + ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'], + ], + [ + 'ab OR !(cd OR ef)', + '(((e.title LIKE ? OR e.content LIKE ?) )) OR NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))', + ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'], + ], + [ + '(ab (!cd OR ef OR (gh))) OR !(ij OR kl)', + '((((e.title LIKE ? OR e.content LIKE ?) )) AND (((e.title NOT LIKE ? AND e.content NOT LIKE ? )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ' . + 'OR (((e.title LIKE ? OR e.content LIKE ?) )))) OR NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))', + ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%', '%ij%', '%ij%', '%kl%', '%kl%'], + ], ]; } } |
