Skip to content

Commit 0a80916

Browse files
authored
ESQL: Pushdown constructs doing case-insensitive regexes (#128393)
This introduces an optimization that pushes down to Lucene those language constructs that aim at case-insensitive regular expression matching, used with the `LIKE` and `RLIKE` operators, such as: * `| WHERE TO_LOWER(field) LIKE "abc*"` * `| WHERE TO_UPPER(field) RLIKE "ABC.*"` These are now pushed down to Lucene as case-insensitive `wildcard` and `regexp` queries, respectively. Closes #127479
1 parent cc461af commit 0a80916

File tree

33 files changed

+756
-236
lines changed

33 files changed

+756
-236
lines changed

benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/EvalBenchmark.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,9 @@
4848
import org.elasticsearch.xpack.esql.expression.function.scalar.math.RoundTo;
4949
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvMin;
5050
import org.elasticsearch.xpack.esql.expression.function.scalar.nulls.Coalesce;
51-
import org.elasticsearch.xpack.esql.expression.function.scalar.string.RLike;
5251
import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToLower;
5352
import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToUpper;
53+
import org.elasticsearch.xpack.esql.expression.function.scalar.string.regex.RLike;
5454
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Add;
5555
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.Equals;
5656
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.LessThan;

docs/changelog/128393.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 128393
2+
summary: Pushdown constructs doing case-insensitive regexes
3+
area: ES|QL
4+
type: enhancement
5+
issues:
6+
- 127479

server/src/main/java/org/elasticsearch/TransportVersions.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ static TransportVersion def(int id) {
272272
public static final TransportVersion ML_INFERENCE_VERTEXAI_CHATCOMPLETION_ADDED = def(9_083_0_00);
273273
public static final TransportVersion INFERENCE_CUSTOM_SERVICE_ADDED = def(9_084_0_00);
274274
public static final TransportVersion ESQL_LIMIT_ROW_SIZE = def(9_085_0_00);
275+
public static final TransportVersion ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY = def(9_086_0_00);
275276

276277
/*
277278
* STOP! READ THIS FIRST! No, really,

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/AbstractStringPattern.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ public abstract class AbstractStringPattern implements StringPattern {
1616

1717
private Automaton automaton;
1818

19-
public abstract Automaton createAutomaton();
19+
public abstract Automaton createAutomaton(boolean ignoreCase);
2020

2121
private Automaton automaton() {
2222
if (automaton == null) {
23-
automaton = createAutomaton();
23+
automaton = createAutomaton(false);
2424
}
2525
return automaton;
2626
}

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/RLike.java

Lines changed: 0 additions & 35 deletions
This file was deleted.

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/RLikePattern.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,10 @@ public RLikePattern(String regexpPattern) {
2121
}
2222

2323
@Override
24-
public Automaton createAutomaton() {
24+
public Automaton createAutomaton(boolean ignoreCase) {
25+
int matchFlags = ignoreCase ? RegExp.CASE_INSENSITIVE : 0;
2526
return Operations.determinize(
26-
new RegExp(regexpPattern, RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT).toAutomaton(),
27+
new RegExp(regexpPattern, RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT, matchFlags).toAutomaton(),
2728
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
2829
);
2930
}

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/WildcardLike.java

Lines changed: 0 additions & 35 deletions
This file was deleted.

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/WildcardPattern.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,13 @@
1010
import org.apache.lucene.search.WildcardQuery;
1111
import org.apache.lucene.util.automaton.Automaton;
1212
import org.apache.lucene.util.automaton.Operations;
13+
import org.apache.lucene.util.automaton.RegExp;
1314
import org.elasticsearch.xpack.esql.core.util.StringUtils;
1415

1516
import java.util.Objects;
1617

18+
import static org.elasticsearch.xpack.esql.core.util.StringUtils.luceneWildcardToRegExp;
19+
1720
/**
1821
* Similar to basic regex, supporting '?' wildcard for single character (same as regex ".")
1922
* and '*' wildcard for multiple characters (same as regex ".*")
@@ -37,8 +40,14 @@ public String pattern() {
3740
}
3841

3942
@Override
40-
public Automaton createAutomaton() {
41-
return WildcardQuery.toAutomaton(new Term(null, wildcard), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
43+
public Automaton createAutomaton(boolean ignoreCase) {
44+
return ignoreCase
45+
? Operations.determinize(
46+
new RegExp(luceneWildcardToRegExp(wildcard), RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT, RegExp.CASE_INSENSITIVE)
47+
.toAutomaton(),
48+
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
49+
)
50+
: WildcardQuery.toAutomaton(new Term(null, wildcard), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
4251
}
4352

4453
@Override

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/util/StringUtils.java

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
package org.elasticsearch.xpack.esql.core.util;
88

99
import org.apache.lucene.document.InetAddressPoint;
10+
import org.apache.lucene.search.WildcardQuery;
1011
import org.apache.lucene.search.spell.LevenshteinDistance;
1112
import org.apache.lucene.util.BytesRef;
1213
import org.apache.lucene.util.CollectionUtil;
@@ -178,6 +179,44 @@ public static String wildcardToJavaPattern(String pattern, char escape) {
178179
return regex.toString();
179180
}
180181

182+
/**
183+
* Translates a Lucene wildcard pattern to a Lucene RegExp one.
184+
* @param wildcard Lucene wildcard pattern
185+
* @return Lucene RegExp pattern
186+
*/
187+
public static String luceneWildcardToRegExp(String wildcard) {
188+
StringBuilder regex = new StringBuilder();
189+
190+
for (int i = 0, wcLen = wildcard.length(); i < wcLen; i++) {
191+
char c = wildcard.charAt(i); // this will work chunking through Unicode as long as all values matched are ASCII
192+
switch (c) {
193+
case WildcardQuery.WILDCARD_STRING -> regex.append(".*");
194+
case WildcardQuery.WILDCARD_CHAR -> regex.append(".");
195+
case WildcardQuery.WILDCARD_ESCAPE -> {
196+
if (i + 1 < wcLen) {
197+
// consume the wildcard escaping, consider the next char
198+
char next = wildcard.charAt(i + 1);
199+
i++;
200+
switch (next) {
201+
case WildcardQuery.WILDCARD_STRING, WildcardQuery.WILDCARD_CHAR, WildcardQuery.WILDCARD_ESCAPE ->
202+
// escape `*`, `.`, `\`, since these are special chars in RegExp as well
203+
regex.append("\\");
204+
// default: unnecessary escaping -- just ignore the escaping
205+
}
206+
regex.append(next);
207+
} else {
208+
// "else fallthru, lenient parsing with a trailing \" -- according to WildcardQuery#toAutomaton
209+
regex.append("\\\\");
210+
}
211+
}
212+
case '$', '(', ')', '+', '.', '[', ']', '^', '{', '|', '}' -> regex.append("\\").append(c);
213+
default -> regex.append(c);
214+
}
215+
}
216+
217+
return regex.toString();
218+
}
219+
181220
/**
182221
* Translates a like pattern to a Lucene wildcard.
183222
* This method pays attention to the custom escape char which gets converted into \ (used by Lucene).

x-pack/plugin/esql-core/src/test/java/org/elasticsearch/xpack/esql/core/util/StringUtilsTests.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99

1010
import org.elasticsearch.test.ESTestCase;
1111

12+
import static org.elasticsearch.xpack.esql.core.util.StringUtils.luceneWildcardToRegExp;
1213
import static org.elasticsearch.xpack.esql.core.util.StringUtils.wildcardToJavaPattern;
14+
import static org.hamcrest.Matchers.is;
1315

1416
public class StringUtilsTests extends ESTestCase {
1517

@@ -55,4 +57,21 @@ public void testWildcard() {
5557
public void testEscapedEscape() {
5658
assertEquals("^\\\\\\\\$", wildcardToJavaPattern("\\\\\\\\", '\\'));
5759
}
60+
61+
public void testLuceneWildcardToRegExp() {
62+
assertThat(luceneWildcardToRegExp(""), is(""));
63+
assertThat(luceneWildcardToRegExp("*"), is(".*"));
64+
assertThat(luceneWildcardToRegExp("?"), is("."));
65+
assertThat(luceneWildcardToRegExp("\\\\"), is("\\\\"));
66+
assertThat(luceneWildcardToRegExp("foo?bar"), is("foo.bar"));
67+
assertThat(luceneWildcardToRegExp("foo*bar"), is("foo.*bar"));
68+
assertThat(luceneWildcardToRegExp("foo\\\\bar"), is("foo\\\\bar"));
69+
assertThat(luceneWildcardToRegExp("foo*bar?baz"), is("foo.*bar.baz"));
70+
assertThat(luceneWildcardToRegExp("foo\\*bar"), is("foo\\*bar"));
71+
assertThat(luceneWildcardToRegExp("foo\\?bar\\?"), is("foo\\?bar\\?"));
72+
assertThat(luceneWildcardToRegExp("foo\\?bar\\"), is("foo\\?bar\\\\"));
73+
assertThat(luceneWildcardToRegExp("[](){}^$.|+"), is("\\[\\]\\(\\)\\{\\}\\^\\$\\.\\|\\+"));
74+
assertThat(luceneWildcardToRegExp("foo\\\uD83D\uDC14bar"), is("foo\uD83D\uDC14bar"));
75+
assertThat(luceneWildcardToRegExp("foo\uD83D\uDC14bar"), is("foo\uD83D\uDC14bar"));
76+
}
5877
}

0 commit comments

Comments
 (0)