aboutsummaryrefslogtreecommitdiff
path: root/app/Models/BooleanSearch.php
diff options
context:
space:
mode:
authorGravatar Alexandre Alapetite <alexandre@alapetite.fr> 2024-08-01 19:44:58 +0200
committerGravatar GitHub <noreply@github.com> 2024-08-01 19:44:58 +0200
commit58de38bd737d2dba617944b54a98d4bdb5810588 (patch)
treeff197baaabd4b3e12b29eb74f3d818d8f4a04e9a /app/Models/BooleanSearch.php
parent666e7b27ce4f9c8254e76373572f220ddfb2fd37 (diff)
Fix parentheses for complex `OR` boolean search expressions (#6672)
* Fix OR parentheses * Pass all tests * Forgotten comment * Minor whitespace * Fix several cases envolving negation * Line length * Fix `OR NOT`
Diffstat (limited to 'app/Models/BooleanSearch.php')
-rw-r--r--app/Models/BooleanSearch.php129
1 files changed, 118 insertions, 11 deletions
diff --git a/app/Models/BooleanSearch.php b/app/Models/BooleanSearch.php
index 692738107..88069c461 100644
--- a/app/Models/BooleanSearch.php
+++ b/app/Models/BooleanSearch.php
@@ -11,11 +11,11 @@ class FreshRSS_BooleanSearch {
private array $searches = [];
/**
- * @phpstan-var 'AND'|'OR'|'AND NOT'
+ * @phpstan-var 'AND'|'OR'|'AND NOT'|'OR NOT'
*/
private string $operator;
- /** @param 'AND'|'OR'|'AND NOT' $operator */
+ /** @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator */
public function __construct(string $input, int $level = 0, string $operator = 'AND', bool $allowUserQueries = true) {
$this->operator = $operator;
$input = trim($input);
@@ -38,6 +38,8 @@ class FreshRSS_BooleanSearch {
}
$this->raw_input = $input;
+ $input = self::consistentOrParentheses($input);
+
// Either parse everything as a series of BooleanSearch’s combined by implicit AND
// or parse everything as a series of Search’s combined by explicit OR
$this->parseParentheses($input, $level) || $this->parseOrSegments($input);
@@ -130,6 +132,107 @@ class FreshRSS_BooleanSearch {
return $input;
}
+ /**
+ * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
+ */
+ public static function addOrParentheses(string $input): string {
+ $input = trim($input);
+ if ($input === '') {
+ return '';
+ }
+ $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
+ $ns = count($splits);
+ if ($ns <= 1) {
+ return $input;
+ }
+ $result = '';
+ $segment = '';
+ for ($i = 0; $i < $ns; $i++) {
+ $segment .= $splits[$i];
+ if (trim($segment) === '') {
+ $segment = '';
+ } elseif (strcasecmp($segment, 'OR') === 0) {
+ $result .= $segment . ' ';
+ $segment = '';
+ } else {
+ $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
+ if ($quotes % 2 === 0) {
+ $segment = trim($segment);
+ if (in_array($segment, ['!', '-'], true)) {
+ $result .= $segment;
+ } else {
+ $result .= '(' . $segment . ') ';
+ }
+ $segment = '';
+ }
+ }
+ }
+ $segment = trim($segment);
+ if (in_array($segment, ['!', '-'], true)) {
+ $result .= $segment;
+ } elseif ($segment !== '') {
+ $result .= '(' . $segment . ')';
+ }
+ return trim($result);
+ }
+
+ /**
+ * If the query contains a mix of `OR` expressions with and without parentheses,
+ * then add parentheses to make the query consistent.
+ * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)'
+ */
+ public static function consistentOrParentheses(string $input): string {
+ if (!preg_match('/(?<!\\\\)\\(/', $input)) {
+ // No unescaped parentheses in the input
+ return trim($input);
+ }
+ $parenthesesCount = 0;
+ $result = '';
+ $segment = '';
+ $length = strlen($input);
+
+ for ($i = 0; $i < $length; $i++) {
+ $c = $input[$i];
+ $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
+ if (!$backslashed) {
+ if ($c === '(') {
+ if ($parenthesesCount === 0) {
+ if ($segment !== '') {
+ $result = rtrim($result) . ' ' . self::addOrParentheses($segment);
+ $negation = preg_match('/[!-]$/', $result);
+ if (!$negation) {
+ $result .= ' ';
+ }
+ $segment = '';
+ }
+ $c = '';
+ }
+ $parenthesesCount++;
+ } elseif ($c === ')') {
+ $parenthesesCount--;
+ if ($parenthesesCount === 0) {
+ $segment = self::consistentOrParentheses($segment);
+ if ($segment !== '') {
+ $result .= '(' . $segment . ')';
+ $segment = '';
+ }
+ $c = '';
+ }
+ }
+ }
+ $segment .= $c;
+ }
+ if (trim($segment) !== '') {
+ $result = rtrim($result);
+ $negation = preg_match('/[!-]$/', $segment);
+ if (!$negation) {
+ $result .= ' ';
+ }
+ $result .= self::addOrParentheses($segment);
+ }
+ return trim($result);
+ }
+
/** @return bool True if some parenthesis logic took over, false otherwise */
private function parseParentheses(string $input, int $level): bool {
$input = trim($input);
@@ -146,9 +249,14 @@ class FreshRSS_BooleanSearch {
$hasParenthesis = true;
$before = trim($before);
- if (preg_match('/[!-]$/i', $before)) {
+ if (preg_match('/[!-]$/', $before)) {
// Trim trailing negation
- $before = substr($before, 0, -1);
+ $before = rtrim($before, ' !-');
+ $isOr = preg_match('/\bOR$/i', $before);
+ if ($isOr) {
+ // Trim trailing OR
+ $before = substr($before, 0, -2);
+ }
// The text prior to the negation is a BooleanSearch
$searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
@@ -157,8 +265,8 @@ class FreshRSS_BooleanSearch {
}
$before = '';
- // The next BooleanSearch will have to be combined with AND NOT instead of default AND
- $nextOperator = 'AND NOT';
+ // The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND
+ $nextOperator = $isOr ? 'OR NOT' : 'AND NOT';
} elseif (preg_match('/\bOR$/i', $before)) {
// Trim trailing OR
$before = substr($before, 0, -2);
@@ -212,7 +320,7 @@ class FreshRSS_BooleanSearch {
$i++;
}
// $sub = trim($sub);
- // if ($sub != '') {
+ // if ($sub !== '') {
// // TODO: Consider throwing an error or warning in case of non-matching parenthesis
// }
// } elseif ($c === ')') {
@@ -249,12 +357,11 @@ class FreshRSS_BooleanSearch {
return;
}
$splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
-
$segment = '';
$ns = count($splits);
for ($i = 0; $i < $ns; $i++) {
$segment = $segment . $splits[$i];
- if (trim($segment) == '' || strcasecmp($segment, 'OR') === 0) {
+ if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) {
$segment = '';
} else {
$quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
@@ -266,7 +373,7 @@ class FreshRSS_BooleanSearch {
}
}
$segment = trim($segment);
- if ($segment != '') {
+ if ($segment !== '') {
$this->searches[] = new FreshRSS_Search($segment);
}
}
@@ -280,7 +387,7 @@ class FreshRSS_BooleanSearch {
return $this->searches;
}
- /** @return 'AND'|'OR'|'AND NOT' depending on how this BooleanSearch should be combined */
+ /** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */
public function operator(): string {
return $this->operator;
}