diff options
Diffstat (limited to 'app/Models/BooleanSearch.php')
| -rw-r--r-- | app/Models/BooleanSearch.php | 129 |
1 files changed, 118 insertions, 11 deletions
diff --git a/app/Models/BooleanSearch.php b/app/Models/BooleanSearch.php index 692738107..88069c461 100644 --- a/app/Models/BooleanSearch.php +++ b/app/Models/BooleanSearch.php @@ -11,11 +11,11 @@ class FreshRSS_BooleanSearch { private array $searches = []; /** - * @phpstan-var 'AND'|'OR'|'AND NOT' + * @phpstan-var 'AND'|'OR'|'AND NOT'|'OR NOT' */ private string $operator; - /** @param 'AND'|'OR'|'AND NOT' $operator */ + /** @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator */ public function __construct(string $input, int $level = 0, string $operator = 'AND', bool $allowUserQueries = true) { $this->operator = $operator; $input = trim($input); @@ -38,6 +38,8 @@ class FreshRSS_BooleanSearch { } $this->raw_input = $input; + $input = self::consistentOrParentheses($input); + // Either parse everything as a series of BooleanSearch’s combined by implicit AND // or parse everything as a series of Search’s combined by explicit OR $this->parseParentheses($input, $level) || $this->parseOrSegments($input); @@ -130,6 +132,107 @@ class FreshRSS_BooleanSearch { return $input; } + /** + * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")' + */ + public static function addOrParentheses(string $input): string { + $input = trim($input); + if ($input === '') { + return ''; + } + $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: []; + $ns = count($splits); + if ($ns <= 1) { + return $input; + } + $result = ''; + $segment = ''; + for ($i = 0; $i < $ns; $i++) { + $segment .= $splits[$i]; + if (trim($segment) === '') { + $segment = ''; + } elseif (strcasecmp($segment, 'OR') === 0) { + $result .= $segment . ' '; + $segment = ''; + } else { + $quotes = substr_count($segment, '"') + substr_count($segment, '"'); + if ($quotes % 2 === 0) { + $segment = trim($segment); + if (in_array($segment, ['!', '-'], true)) { + $result .= $segment; + } else { + $result .= '(' . $segment . ') '; + } + $segment = ''; + } + } + } + $segment = trim($segment); + if (in_array($segment, ['!', '-'], true)) { + $result .= $segment; + } elseif ($segment !== '') { + $result .= '(' . $segment . ')'; + } + return trim($result); + } + + /** + * If the query contains a mix of `OR` expressions with and without parentheses, + * then add parentheses to make the query consistent. + * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)' + */ + public static function consistentOrParentheses(string $input): string { + if (!preg_match('/(?<!\\\\)\\(/', $input)) { + // No unescaped parentheses in the input + return trim($input); + } + $parenthesesCount = 0; + $result = ''; + $segment = ''; + $length = strlen($input); + + for ($i = 0; $i < $length; $i++) { + $c = $input[$i]; + $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false; + if (!$backslashed) { + if ($c === '(') { + if ($parenthesesCount === 0) { + if ($segment !== '') { + $result = rtrim($result) . ' ' . self::addOrParentheses($segment); + $negation = preg_match('/[!-]$/', $result); + if (!$negation) { + $result .= ' '; + } + $segment = ''; + } + $c = ''; + } + $parenthesesCount++; + } elseif ($c === ')') { + $parenthesesCount--; + if ($parenthesesCount === 0) { + $segment = self::consistentOrParentheses($segment); + if ($segment !== '') { + $result .= '(' . $segment . ')'; + $segment = ''; + } + $c = ''; + } + } + } + $segment .= $c; + } + if (trim($segment) !== '') { + $result = rtrim($result); + $negation = preg_match('/[!-]$/', $segment); + if (!$negation) { + $result .= ' '; + } + $result .= self::addOrParentheses($segment); + } + return trim($result); + } + /** @return bool True if some parenthesis logic took over, false otherwise */ private function parseParentheses(string $input, int $level): bool { $input = trim($input); @@ -146,9 +249,14 @@ class FreshRSS_BooleanSearch { $hasParenthesis = true; $before = trim($before); - if (preg_match('/[!-]$/i', $before)) { + if (preg_match('/[!-]$/', $before)) { // Trim trailing negation - $before = substr($before, 0, -1); + $before = rtrim($before, ' !-'); + $isOr = preg_match('/\bOR$/i', $before); + if ($isOr) { + // Trim trailing OR + $before = substr($before, 0, -2); + } // The text prior to the negation is a BooleanSearch $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator); @@ -157,8 +265,8 @@ class FreshRSS_BooleanSearch { } $before = ''; - // The next BooleanSearch will have to be combined with AND NOT instead of default AND - $nextOperator = 'AND NOT'; + // The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND + $nextOperator = $isOr ? 'OR NOT' : 'AND NOT'; } elseif (preg_match('/\bOR$/i', $before)) { // Trim trailing OR $before = substr($before, 0, -2); @@ -212,7 +320,7 @@ class FreshRSS_BooleanSearch { $i++; } // $sub = trim($sub); - // if ($sub != '') { + // if ($sub !== '') { // // TODO: Consider throwing an error or warning in case of non-matching parenthesis // } // } elseif ($c === ')') { @@ -249,12 +357,11 @@ class FreshRSS_BooleanSearch { return; } $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: []; - $segment = ''; $ns = count($splits); for ($i = 0; $i < $ns; $i++) { $segment = $segment . $splits[$i]; - if (trim($segment) == '' || strcasecmp($segment, 'OR') === 0) { + if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) { $segment = ''; } else { $quotes = substr_count($segment, '"') + substr_count($segment, '"'); @@ -266,7 +373,7 @@ class FreshRSS_BooleanSearch { } } $segment = trim($segment); - if ($segment != '') { + if ($segment !== '') { $this->searches[] = new FreshRSS_Search($segment); } } @@ -280,7 +387,7 @@ class FreshRSS_BooleanSearch { return $this->searches; } - /** @return 'AND'|'OR'|'AND NOT' depending on how this BooleanSearch should be combined */ + /** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */ public function operator(): string { return $this->operator; } |
