aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Alexandre Alapetite <alexandre@alapetite.fr> 2024-08-01 19:44:58 +0200
committerGravatar GitHub <noreply@github.com> 2024-08-01 19:44:58 +0200
commit58de38bd737d2dba617944b54a98d4bdb5810588 (patch)
treeff197baaabd4b3e12b29eb74f3d818d8f4a04e9a
parent666e7b27ce4f9c8254e76373572f220ddfb2fd37 (diff)
Fix parentheses for complex `OR` boolean search expressions (#6672)
* Fix OR parentheses * Pass all tests * Forgotten comment * Minor whitespace * Fix several cases envolving negation * Line length * Fix `OR NOT`
-rw-r--r--app/Models/BooleanSearch.php129
-rw-r--r--app/Models/Entry.php20
-rw-r--r--app/Models/EntryDAO.php2
-rw-r--r--tests/app/Models/SearchTest.php114
4 files changed, 245 insertions, 20 deletions
diff --git a/app/Models/BooleanSearch.php b/app/Models/BooleanSearch.php
index 692738107..88069c461 100644
--- a/app/Models/BooleanSearch.php
+++ b/app/Models/BooleanSearch.php
@@ -11,11 +11,11 @@ class FreshRSS_BooleanSearch {
private array $searches = [];
/**
- * @phpstan-var 'AND'|'OR'|'AND NOT'
+ * @phpstan-var 'AND'|'OR'|'AND NOT'|'OR NOT'
*/
private string $operator;
- /** @param 'AND'|'OR'|'AND NOT' $operator */
+ /** @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator */
public function __construct(string $input, int $level = 0, string $operator = 'AND', bool $allowUserQueries = true) {
$this->operator = $operator;
$input = trim($input);
@@ -38,6 +38,8 @@ class FreshRSS_BooleanSearch {
}
$this->raw_input = $input;
+ $input = self::consistentOrParentheses($input);
+
// Either parse everything as a series of BooleanSearch’s combined by implicit AND
// or parse everything as a series of Search’s combined by explicit OR
$this->parseParentheses($input, $level) || $this->parseOrSegments($input);
@@ -130,6 +132,107 @@ class FreshRSS_BooleanSearch {
return $input;
}
+ /**
+ * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
+ */
+ public static function addOrParentheses(string $input): string {
+ $input = trim($input);
+ if ($input === '') {
+ return '';
+ }
+ $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
+ $ns = count($splits);
+ if ($ns <= 1) {
+ return $input;
+ }
+ $result = '';
+ $segment = '';
+ for ($i = 0; $i < $ns; $i++) {
+ $segment .= $splits[$i];
+ if (trim($segment) === '') {
+ $segment = '';
+ } elseif (strcasecmp($segment, 'OR') === 0) {
+ $result .= $segment . ' ';
+ $segment = '';
+ } else {
+ $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
+ if ($quotes % 2 === 0) {
+ $segment = trim($segment);
+ if (in_array($segment, ['!', '-'], true)) {
+ $result .= $segment;
+ } else {
+ $result .= '(' . $segment . ') ';
+ }
+ $segment = '';
+ }
+ }
+ }
+ $segment = trim($segment);
+ if (in_array($segment, ['!', '-'], true)) {
+ $result .= $segment;
+ } elseif ($segment !== '') {
+ $result .= '(' . $segment . ')';
+ }
+ return trim($result);
+ }
+
+ /**
+ * If the query contains a mix of `OR` expressions with and without parentheses,
+ * then add parentheses to make the query consistent.
+ * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)'
+ */
+ public static function consistentOrParentheses(string $input): string {
+ if (!preg_match('/(?<!\\\\)\\(/', $input)) {
+ // No unescaped parentheses in the input
+ return trim($input);
+ }
+ $parenthesesCount = 0;
+ $result = '';
+ $segment = '';
+ $length = strlen($input);
+
+ for ($i = 0; $i < $length; $i++) {
+ $c = $input[$i];
+ $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
+ if (!$backslashed) {
+ if ($c === '(') {
+ if ($parenthesesCount === 0) {
+ if ($segment !== '') {
+ $result = rtrim($result) . ' ' . self::addOrParentheses($segment);
+ $negation = preg_match('/[!-]$/', $result);
+ if (!$negation) {
+ $result .= ' ';
+ }
+ $segment = '';
+ }
+ $c = '';
+ }
+ $parenthesesCount++;
+ } elseif ($c === ')') {
+ $parenthesesCount--;
+ if ($parenthesesCount === 0) {
+ $segment = self::consistentOrParentheses($segment);
+ if ($segment !== '') {
+ $result .= '(' . $segment . ')';
+ $segment = '';
+ }
+ $c = '';
+ }
+ }
+ }
+ $segment .= $c;
+ }
+ if (trim($segment) !== '') {
+ $result = rtrim($result);
+ $negation = preg_match('/[!-]$/', $segment);
+ if (!$negation) {
+ $result .= ' ';
+ }
+ $result .= self::addOrParentheses($segment);
+ }
+ return trim($result);
+ }
+
/** @return bool True if some parenthesis logic took over, false otherwise */
private function parseParentheses(string $input, int $level): bool {
$input = trim($input);
@@ -146,9 +249,14 @@ class FreshRSS_BooleanSearch {
$hasParenthesis = true;
$before = trim($before);
- if (preg_match('/[!-]$/i', $before)) {
+ if (preg_match('/[!-]$/', $before)) {
// Trim trailing negation
- $before = substr($before, 0, -1);
+ $before = rtrim($before, ' !-');
+ $isOr = preg_match('/\bOR$/i', $before);
+ if ($isOr) {
+ // Trim trailing OR
+ $before = substr($before, 0, -2);
+ }
// The text prior to the negation is a BooleanSearch
$searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
@@ -157,8 +265,8 @@ class FreshRSS_BooleanSearch {
}
$before = '';
- // The next BooleanSearch will have to be combined with AND NOT instead of default AND
- $nextOperator = 'AND NOT';
+ // The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND
+ $nextOperator = $isOr ? 'OR NOT' : 'AND NOT';
} elseif (preg_match('/\bOR$/i', $before)) {
// Trim trailing OR
$before = substr($before, 0, -2);
@@ -212,7 +320,7 @@ class FreshRSS_BooleanSearch {
$i++;
}
// $sub = trim($sub);
- // if ($sub != '') {
+ // if ($sub !== '') {
// // TODO: Consider throwing an error or warning in case of non-matching parenthesis
// }
// } elseif ($c === ')') {
@@ -249,12 +357,11 @@ class FreshRSS_BooleanSearch {
return;
}
$splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
-
$segment = '';
$ns = count($splits);
for ($i = 0; $i < $ns; $i++) {
$segment = $segment . $splits[$i];
- if (trim($segment) == '' || strcasecmp($segment, 'OR') === 0) {
+ if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) {
$segment = '';
} else {
$quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
@@ -266,7 +373,7 @@ class FreshRSS_BooleanSearch {
}
}
$segment = trim($segment);
- if ($segment != '') {
+ if ($segment !== '') {
$this->searches[] = new FreshRSS_Search($segment);
}
}
@@ -280,7 +387,7 @@ class FreshRSS_BooleanSearch {
return $this->searches;
}
- /** @return 'AND'|'OR'|'AND NOT' depending on how this BooleanSearch should be combined */
+ /** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */
public function operator(): string {
return $this->operator;
}
diff --git a/app/Models/Entry.php b/app/Models/Entry.php
index e4d728ba4..ada6e9944 100644
--- a/app/Models/Entry.php
+++ b/app/Models/Entry.php
@@ -577,12 +577,20 @@ HTML;
foreach ($booleanSearch->searches() as $filter) {
if ($filter instanceof FreshRSS_BooleanSearch) {
// BooleanSearches are combined by AND (default) or OR or AND NOT (special cases) operators and are recursive
- if ($filter->operator() === 'OR') {
- $ok |= $this->matches($filter);
- } elseif ($filter->operator() === 'AND NOT') {
- $ok &= !$this->matches($filter);
- } else { // AND
- $ok &= $this->matches($filter);
+ switch ($filter->operator()) {
+ case 'OR':
+ $ok |= $this->matches($filter);
+ break;
+ case 'OR NOT':
+ $ok |= !$this->matches($filter);
+ break;
+ case 'AND NOT':
+ $ok &= !$this->matches($filter);
+ break;
+ case 'AND':
+ default:
+ $ok &= $this->matches($filter);
+ break;
}
} elseif ($filter instanceof FreshRSS_Search) {
// Searches are combined by OR and are not recursive
diff --git a/app/Models/EntryDAO.php b/app/Models/EntryDAO.php
index ec627dda1..ba0cf1970 100644
--- a/app/Models/EntryDAO.php
+++ b/app/Models/EntryDAO.php
@@ -762,7 +762,7 @@ SQL;
if ($filterSearch !== '') {
if ($search !== '') {
$search .= $filter->operator();
- } elseif ($filter->operator() === 'AND NOT') {
+ } elseif (in_array($filter->operator(), ['AND NOT', 'OR NOT'], true)) {
// Special case if we start with a negation (there is already the default AND before)
$search .= ' NOT';
}
diff --git a/tests/app/Models/SearchTest.php b/tests/app/Models/SearchTest.php
index e4457fedc..e01830314 100644
--- a/tests/app/Models/SearchTest.php
+++ b/tests/app/Models/SearchTest.php
@@ -285,12 +285,59 @@ class SearchTest extends PHPUnit\Framework\TestCase {
}
/**
+ * @dataProvider provideAddOrParentheses
+ */
+ public function test__addOrParentheses(string $input, string $output): void {
+ self::assertEquals($output, FreshRSS_BooleanSearch::addOrParentheses($input));
+ }
+
+ /** @return array<array{string,string}> */
+ public function provideAddOrParentheses(): array {
+ return [
+ ['ab', 'ab'],
+ ['ab cd', 'ab cd'],
+ ['!ab -cd', '!ab -cd'],
+ ['ab OR cd', '(ab) OR (cd)'],
+ ['!ab OR -cd', '(!ab) OR (-cd)'],
+ ['ab cd OR ef OR "gh ij"', '(ab cd) OR (ef) OR ("gh ij")'],
+ ['ab (!cd)', 'ab (!cd)'],
+ ];
+ }
+
+ /**
+ * @dataProvider provideconsistentOrParentheses
+ */
+ public function test__consistentOrParentheses(string $input, string $output): void {
+ self::assertEquals($output, FreshRSS_BooleanSearch::consistentOrParentheses($input));
+ }
+
+ /** @return array<array{string,string}> */
+ public function provideconsistentOrParentheses(): array {
+ return [
+ ['ab cd ef', 'ab cd ef'],
+ ['(ab cd ef)', '(ab cd ef)'],
+ ['("ab cd" ef)', '("ab cd" ef)'],
+ ['"ab cd" (ef gh) "ij kl"', '"ab cd" (ef gh) "ij kl"'],
+ ['ab (!cd)', 'ab (!cd)'],
+ ['ab !(cd)', 'ab !(cd)'],
+ ['(ab) -(cd)', '(ab) -(cd)'],
+ ['ab cd OR ef OR "gh ij"', 'ab cd OR ef OR "gh ij"'],
+ ['"plain or text" OR (cd)', '("plain or text") OR (cd)'],
+ ['(ab) OR cd OR ef OR (gh)', '(ab) OR (cd) OR (ef) OR (gh)'],
+ ['(ab (cd OR ef)) OR gh OR ij OR (kl)', '(ab (cd OR ef)) OR (gh) OR (ij) OR (kl)'],
+ ['(ab (cd OR ef OR (gh))) OR ij', '(ab ((cd) OR (ef) OR (gh))) OR (ij)'],
+ ['(ab (!cd OR ef OR (gh))) OR ij', '(ab ((!cd) OR (ef) OR (gh))) OR (ij)'],
+ ['(ab !(cd OR ef OR !(gh))) OR ij', '(ab !((cd) OR (ef) OR !(gh))) OR (ij)'],
+ ];
+ }
+
+ /**
* @dataProvider provideParentheses
* @param array<string> $values
*/
- public function test__construct_parentheses(string $input, string $sql, array $values): void {
+ public function test__parentheses(string $input, string $sql, array $values): void {
[$filterValues, $filterSearch] = FreshRSS_EntryDAOPGSQL::sqlBooleanSearch('e.', new FreshRSS_BooleanSearch($input));
- self::assertEquals($sql, $filterSearch);
+ self::assertEquals(trim($sql), trim($filterSearch));
self::assertEquals($values, $filterValues);
}
@@ -337,6 +384,69 @@ class SearchTest extends PHPUnit\Framework\TestCase {
'(e.title LIKE ? )',
['%"hello world"%'],
],
+ [
+ '(ab) OR (cd) OR (ef)',
+ '(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
+ ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+ ],
+ [
+ '("plain or text") OR (cd)',
+ '(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
+ ['%plain or text%', '%plain or text%', '%cd%', '%cd%'],
+ ],
+ [
+ '"plain or text" OR cd',
+ '((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )',
+ ['%plain or text%', '%plain or text%', '%cd%', '%cd%'],
+ ],
+ [
+ '"plain OR text" OR cd',
+ '((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ) ',
+ ['%plain OR text%', '%plain OR text%', '%cd%', '%cd%'],
+ ],
+ [
+ 'ab OR cd OR (ef)',
+ '(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ',
+ ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+ ],
+ [
+ 'ab OR cd OR ef',
+ '((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )',
+ ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+ ],
+ [
+ '(ab) cd OR ef OR (gh)',
+ '(((e.title LIKE ? OR e.content LIKE ?) )) AND (((e.title LIKE ? OR e.content LIKE ?) )) ' .
+ 'OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
+ ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%'],
+ ],
+ [
+ '(ab) OR cd OR ef OR (gh)',
+ '(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ' .
+ 'OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
+ ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%'],
+ ],
+ [
+ 'ab OR (!(cd OR ef))',
+ '(((e.title LIKE ? OR e.content LIKE ?) )) OR (NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )))',
+ ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+ ],
+ [
+ 'ab !(cd OR ef)',
+ '(((e.title LIKE ? OR e.content LIKE ?) )) AND NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))',
+ ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+ ],
+ [
+ 'ab OR !(cd OR ef)',
+ '(((e.title LIKE ? OR e.content LIKE ?) )) OR NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))',
+ ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+ ],
+ [
+ '(ab (!cd OR ef OR (gh))) OR !(ij OR kl)',
+ '((((e.title LIKE ? OR e.content LIKE ?) )) AND (((e.title NOT LIKE ? AND e.content NOT LIKE ? )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ' .
+ 'OR (((e.title LIKE ? OR e.content LIKE ?) )))) OR NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))',
+ ['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%', '%ij%', '%ij%', '%kl%', '%kl%'],
+ ],
];
}
}