Skip to content

Commit caf2c94

Browse files
committed
Pushdown constructs doing case-insensitive regexes
This introduces an optimization that pushes down to Lucene those language constructs that aim at case-insensitive pattern matching with the LIKE and RLIKE operators, such as: * `| WHERE TO_LOWER(field) LIKE "abc*"` * `| WHERE TO_UPPER(field) RLIKE "ABC.*"` These are now pushed down to Lucene as case-insensitive `wildcard` and `regexp` queries, respectively.
1 parent c96a99a commit caf2c94

File tree

13 files changed

+439
-19
lines changed

13 files changed

+439
-19
lines changed

server/src/main/java/org/elasticsearch/TransportVersions.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@ static TransportVersion def(int id) {
263263
public static final TransportVersion ML_INFERENCE_HUGGING_FACE_CHAT_COMPLETION_ADDED = def(9_078_0_00);
264264
public static final TransportVersion NODES_STATS_SUPPORTS_MULTI_PROJECT = def(9_079_0_00);
265265
public static final TransportVersion ML_INFERENCE_HUGGING_FACE_RERANK_ADDED = def(9_080_0_00);
266+
public static final TransportVersion ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY = def(9_081_0_00);
266267
/*
267268
* STOP! READ THIS FIRST! No, really,
268269
* ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/AbstractStringPattern.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ public abstract class AbstractStringPattern implements StringPattern {
1616

1717
private Automaton automaton;
1818

19-
public abstract Automaton createAutomaton();
19+
public abstract Automaton createAutomaton(boolean ignoreCase);
2020

2121
private Automaton automaton() {
2222
if (automaton == null) {
23-
automaton = createAutomaton();
23+
automaton = createAutomaton(false);
2424
}
2525
return automaton;
2626
}

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/RLikePattern.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,10 @@ public RLikePattern(String regexpPattern) {
2121
}
2222

2323
@Override
24-
public Automaton createAutomaton() {
24+
public Automaton createAutomaton(boolean ignoreCase) {
25+
int matchFlags = ignoreCase ? RegExp.CASE_INSENSITIVE : 0;
2526
return Operations.determinize(
26-
new RegExp(regexpPattern, RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT).toAutomaton(),
27+
new RegExp(regexpPattern, RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT, matchFlags).toAutomaton(),
2728
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
2829
);
2930
}

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/WildcardPattern.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,13 @@
1010
import org.apache.lucene.search.WildcardQuery;
1111
import org.apache.lucene.util.automaton.Automaton;
1212
import org.apache.lucene.util.automaton.Operations;
13+
import org.apache.lucene.util.automaton.RegExp;
1314
import org.elasticsearch.xpack.esql.core.util.StringUtils;
1415

1516
import java.util.Objects;
1617

18+
import static org.elasticsearch.xpack.esql.core.util.StringUtils.luceneWildcardToRegExp;
19+
1720
/**
1821
* Similar to basic regex, supporting '?' wildcard for single character (same as regex ".")
1922
* and '*' wildcard for multiple characters (same as regex ".*")
@@ -37,8 +40,14 @@ public String pattern() {
3740
}
3841

3942
@Override
40-
public Automaton createAutomaton() {
41-
return WildcardQuery.toAutomaton(new Term(null, wildcard), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
43+
public Automaton createAutomaton(boolean ignoreCase) {
44+
return ignoreCase
45+
? Operations.determinize(
46+
new RegExp(luceneWildcardToRegExp(wildcard), RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT, RegExp.CASE_INSENSITIVE)
47+
.toAutomaton(),
48+
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
49+
)
50+
: WildcardQuery.toAutomaton(new Term(null, wildcard), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
4251
}
4352

4453
@Override

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/util/StringUtils.java

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
package org.elasticsearch.xpack.esql.core.util;
88

99
import org.apache.lucene.document.InetAddressPoint;
10+
import org.apache.lucene.search.WildcardQuery;
1011
import org.apache.lucene.search.spell.LevenshteinDistance;
1112
import org.apache.lucene.util.BytesRef;
1213
import org.apache.lucene.util.CollectionUtil;
@@ -178,6 +179,44 @@ public static String wildcardToJavaPattern(String pattern, char escape) {
178179
return regex.toString();
179180
}
180181

182+
/**
183+
* Translates a Lucene wildcard pattern to a Lucene RegExp one.
184+
* @param wildcard Lucene wildcard pattern
185+
* @return Lucene RegExp pattern
186+
*/
187+
public static String luceneWildcardToRegExp(String wildcard) {
188+
StringBuilder regex = new StringBuilder();
189+
190+
for (int i = 0, wcLen = wildcard.length(); i < wcLen; i++) {
191+
char c = wildcard.charAt(i); // this will work chunking through Unicode as long as all values matched are ASCII
192+
switch (c) {
193+
case WildcardQuery.WILDCARD_STRING -> regex.append(".*");
194+
case WildcardQuery.WILDCARD_CHAR -> regex.append(".");
195+
case WildcardQuery.WILDCARD_ESCAPE -> {
196+
if (i + 1 < wcLen) {
197+
// consume the wildcard escaping, consider the next char
198+
char next = wildcard.charAt(i + 1);
199+
i++;
200+
switch (next) {
201+
case WildcardQuery.WILDCARD_STRING, WildcardQuery.WILDCARD_CHAR, WildcardQuery.WILDCARD_ESCAPE ->
202+
// escape `*`, `.`, `\`, since these are special chars in RegExp as well
203+
regex.append("\\");
204+
// default: unnecessary escaping -- just ignore the escaping
205+
}
206+
regex.append(next);
207+
} else {
208+
// "else fallthru, lenient parsing with a trailing \" -- according to WildcardQuery#toAutomaton
209+
regex.append("\\\\");
210+
}
211+
}
212+
case '$', '(', ')', '+', '.', '[', ']', '^', '{', '|', '}' -> regex.append("\\").append(c);
213+
default -> regex.append(c);
214+
}
215+
}
216+
217+
return regex.toString();
218+
}
219+
181220
/**
182221
* Translates a like pattern to a Lucene wildcard.
183222
* This methods pays attention to the custom escape char which gets converted into \ (used by Lucene).

x-pack/plugin/esql-core/src/test/java/org/elasticsearch/xpack/esql/core/util/StringUtilsTests.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,11 @@
99

1010
import org.elasticsearch.test.ESTestCase;
1111

12+
import static org.elasticsearch.xpack.esql.core.util.StringUtils.luceneWildcardToRegExp;
1213
import static org.elasticsearch.xpack.esql.core.util.StringUtils.wildcardToJavaPattern;
1314

15+
import static org.hamcrest.Matchers.is;
16+
1417
public class StringUtilsTests extends ESTestCase {
1518

1619
public void testNoWildcard() {
@@ -55,4 +58,21 @@ public void testWildcard() {
5558
public void testEscapedEscape() {
5659
assertEquals("^\\\\\\\\$", wildcardToJavaPattern("\\\\\\\\", '\\'));
5760
}
61+
62+
public void testLuceneWildcardToRegExp() {
63+
assertThat(luceneWildcardToRegExp(""), is(""));
64+
assertThat(luceneWildcardToRegExp("*"), is(".*"));
65+
assertThat(luceneWildcardToRegExp("?"), is("."));
66+
assertThat(luceneWildcardToRegExp("\\\\"), is("\\\\"));
67+
assertThat(luceneWildcardToRegExp("foo?bar"), is("foo.bar"));
68+
assertThat(luceneWildcardToRegExp("foo*bar"), is("foo.*bar"));
69+
assertThat(luceneWildcardToRegExp("foo\\\\bar"), is("foo\\\\bar"));
70+
assertThat(luceneWildcardToRegExp("foo*bar?baz"), is("foo.*bar.baz"));
71+
assertThat(luceneWildcardToRegExp("foo\\*bar"), is("foo\\*bar"));
72+
assertThat(luceneWildcardToRegExp("foo\\?bar\\?"), is("foo\\?bar\\?"));
73+
assertThat(luceneWildcardToRegExp("foo\\?bar\\"), is("foo\\?bar\\\\"));
74+
assertThat(luceneWildcardToRegExp("[](){}^$.|+"), is("\\[\\]\\(\\)\\{\\}\\^\\$\\.\\|\\+"));
75+
assertThat(luceneWildcardToRegExp("foo\\\uD83D\uDC14bar"), is("foo\uD83D\uDC14bar"));
76+
assertThat(luceneWildcardToRegExp("foo\uD83D\uDC14bar"), is("foo\uD83D\uDC14bar"));
77+
}
5878
}

x-pack/plugin/esql/qa/testFixtures/src/main/resources/where-like.csv-spec

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,3 +319,63 @@ warningRegex:java.lang.IllegalArgumentException: single-value function encounter
319319
emp_no:integer | job_positions:keyword
320320
10025 | Accountant
321321
;
322+
323+
likeWithUpper
324+
FROM employees
325+
| KEEP emp_no, first_name
326+
| SORT emp_no
327+
| WHERE TO_UPPER(first_name) LIKE "GEOR*"
328+
;
329+
330+
emp_no:integer |first_name:keyword
331+
10001 |Georgi
332+
10055 |Georgy
333+
;
334+
335+
likeWithLower
336+
FROM employees
337+
| KEEP emp_no, first_name
338+
| SORT emp_no
339+
| WHERE TO_LOWER(TO_UPPER(first_name)) LIKE "geor*"
340+
;
341+
342+
emp_no:integer |first_name:keyword
343+
10001 |Georgi
344+
10055 |Georgy
345+
;
346+
347+
rlikeWithUpper
348+
FROM employees
349+
| KEEP emp_no, first_name
350+
| SORT emp_no
351+
| WHERE TO_UPPER(first_name) RLIKE "GEOR.*"
352+
;
353+
354+
emp_no:integer |first_name:keyword
355+
10001 |Georgi
356+
10055 |Georgy
357+
;
358+
359+
rlikeWithLower
360+
FROM employees
361+
| KEEP emp_no, first_name
362+
| SORT emp_no
363+
| WHERE TO_LOWER(TO_UPPER(first_name)) RLIKE "geor.*"
364+
;
365+
366+
emp_no:integer |first_name:keyword
367+
10001 |Georgi
368+
10055 |Georgy
369+
;
370+
371+
negatedRLikeWithLower
372+
FROM employees
373+
| KEEP emp_no, first_name
374+
| SORT emp_no
375+
| WHERE TO_LOWER(TO_UPPER(first_name)) NOT RLIKE "geor.*"
376+
| STATS c = COUNT()
377+
;
378+
379+
c:long
380+
88
381+
;

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/RLike.java

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@
77

88
package org.elasticsearch.xpack.esql.expression.function.scalar.string;
99

10+
import org.elasticsearch.TransportVersions;
1011
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
1112
import org.elasticsearch.common.io.stream.StreamInput;
1213
import org.elasticsearch.common.io.stream.StreamOutput;
1314
import org.elasticsearch.compute.operator.EvalOperator;
15+
import org.elasticsearch.xpack.esql.EsqlIllegalArgumentException;
1416
import org.elasticsearch.xpack.esql.capabilities.TranslationAware;
1517
import org.elasticsearch.xpack.esql.core.expression.Expression;
1618
import org.elasticsearch.xpack.esql.core.expression.FoldContext;
@@ -37,6 +39,7 @@ public class RLike extends org.elasticsearch.xpack.esql.core.expression.predicat
3739
EvaluatorMapper,
3840
TranslationAware.SingleValueTranslationAware {
3941
public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "RLike", RLike::new);
42+
public static final String NAME = "RLIKE";
4043

4144
@FunctionInfo(returnType = "boolean", description = """
4245
Use `RLIKE` to filter data based on string patterns using using
@@ -52,28 +55,43 @@ Matching special characters (eg. `.`, `*`, `(`...) will require escaping.
5255
To reduce the overhead of escaping, we suggest using triple quotes strings `\"\"\"`
5356
5457
<<load-esql-example, file=string tag=rlikeEscapingTripleQuotes>>
55-
""", operator = "RLIKE", examples = @Example(file = "docs", tag = "rlike"))
58+
""", operator = NAME, examples = @Example(file = "docs", tag = "rlike"))
5659
public RLike(
5760
Source source,
5861
@Param(name = "str", type = { "keyword", "text" }, description = "A literal value.") Expression value,
5962
@Param(name = "pattern", type = { "keyword", "text" }, description = "A regular expression.") RLikePattern pattern
6063
) {
61-
super(source, value, pattern);
64+
this(source, value, pattern, false);
6265
}
6366

6467
public RLike(Source source, Expression field, RLikePattern rLikePattern, boolean caseInsensitive) {
6568
super(source, field, rLikePattern, caseInsensitive);
6669
}
6770

6871
private RLike(StreamInput in) throws IOException {
69-
this(Source.readFrom((PlanStreamInput) in), in.readNamedWriteable(Expression.class), new RLikePattern(in.readString()));
72+
this(
73+
Source.readFrom((PlanStreamInput) in),
74+
in.readNamedWriteable(Expression.class),
75+
new RLikePattern(in.readString()),
76+
in.getTransportVersion().onOrAfter(TransportVersions.ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY) && in.readBoolean()
77+
);
7078
}
7179

7280
@Override
7381
public void writeTo(StreamOutput out) throws IOException {
7482
source().writeTo(out);
7583
out.writeNamedWriteable(field());
7684
out.writeString(pattern().asJavaRegex());
85+
if (caseInsensitive() && out.getTransportVersion().before(TransportVersions.ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY)) {
86+
// The plan has been optimized to run a case-insensitive match, which the remote peer cannot be notified of. Simply avoiding
87+
// the serialization of the boolean would result in wrong results.
88+
throw new EsqlIllegalArgumentException(
89+
NAME + " with case insensitivity is not supported in peer node's version [{}]. Upgrade to version [{}] or newer.",
90+
out.getTransportVersion(),
91+
TransportVersions.ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY
92+
);
93+
}
94+
out.writeBoolean(caseInsensitive());
7795
}
7896

7997
@Override
@@ -103,7 +121,7 @@ public Boolean fold(FoldContext ctx) {
103121

104122
@Override
105123
public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) {
106-
return AutomataMatch.toEvaluator(source(), toEvaluator.apply(field()), pattern().createAutomaton());
124+
return AutomataMatch.toEvaluator(source(), toEvaluator.apply(field()), pattern().createAutomaton(caseInsensitive()));
107125
}
108126

109127
@Override
@@ -122,4 +140,9 @@ public Query asQuery(LucenePushdownPredicates pushdownPredicates, TranslatorHand
122140
public Expression singleValueField() {
123141
return field();
124142
}
143+
144+
@Override
145+
public String nodeString() {
146+
return NAME + "(" + field().nodeString() + ", \"" + pattern().pattern() + "\", " + caseInsensitive() + ")";
147+
}
125148
}

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/WildcardLike.java

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88
package org.elasticsearch.xpack.esql.expression.function.scalar.string;
99

1010
import org.apache.lucene.util.automaton.Automata;
11+
import org.elasticsearch.TransportVersions;
1112
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
1213
import org.elasticsearch.common.io.stream.StreamInput;
1314
import org.elasticsearch.common.io.stream.StreamOutput;
1415
import org.elasticsearch.compute.operator.EvalOperator;
16+
import org.elasticsearch.xpack.esql.EsqlIllegalArgumentException;
1517
import org.elasticsearch.xpack.esql.capabilities.TranslationAware;
1618
import org.elasticsearch.xpack.esql.core.expression.Expression;
1719
import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
@@ -43,6 +45,7 @@ public class WildcardLike extends org.elasticsearch.xpack.esql.core.expression.p
4345
"WildcardLike",
4446
WildcardLike::new
4547
);
48+
public static final String NAME = "LIKE";
4649

4750
@FunctionInfo(returnType = "boolean", description = """
4851
Use `LIKE` to filter data based on string patterns using wildcards. `LIKE`
@@ -63,24 +66,43 @@ also act on a constant (literal) expression. The right-hand side of the operator
6366
To reduce the overhead of escaping, we suggest using triple quotes strings `\"\"\"`
6467
6568
<<load-esql-example, file=string tag=likeEscapingTripleQuotes>>
66-
""", operator = "LIKE", examples = @Example(file = "docs", tag = "like"))
69+
""", operator = NAME, examples = @Example(file = "docs", tag = "like"))
6770
public WildcardLike(
6871
Source source,
6972
@Param(name = "str", type = { "keyword", "text" }, description = "A literal expression.") Expression left,
7073
@Param(name = "pattern", type = { "keyword", "text" }, description = "Pattern.") WildcardPattern pattern
7174
) {
72-
super(source, left, pattern, false);
75+
this(source, left, pattern, false);
76+
}
77+
78+
public WildcardLike(Source source, Expression left, WildcardPattern pattern, boolean caseInsensitive) {
79+
super(source, left, pattern, caseInsensitive);
7380
}
7481

7582
private WildcardLike(StreamInput in) throws IOException {
76-
this(Source.readFrom((PlanStreamInput) in), in.readNamedWriteable(Expression.class), new WildcardPattern(in.readString()));
83+
this(
84+
Source.readFrom((PlanStreamInput) in),
85+
in.readNamedWriteable(Expression.class),
86+
new WildcardPattern(in.readString()),
87+
in.getTransportVersion().onOrAfter(TransportVersions.ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY) && in.readBoolean()
88+
);
7789
}
7890

7991
@Override
8092
public void writeTo(StreamOutput out) throws IOException {
8193
source().writeTo(out);
8294
out.writeNamedWriteable(field());
8395
out.writeString(pattern().pattern());
96+
if (caseInsensitive() && out.getTransportVersion().before(TransportVersions.ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY)) {
97+
// The plan has been optimized to run a case-insensitive match, which the remote peer cannot be notified of. Simply avoiding
98+
// the serialization of the boolean would result in wrong results.
99+
throw new EsqlIllegalArgumentException(
100+
NAME + " with case insensitivity is not supported in peer node's version [{}]. Upgrade to version [{}] or newer.",
101+
out.getTransportVersion(),
102+
TransportVersions.ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY
103+
);
104+
}
105+
out.writeBoolean(caseInsensitive());
84106
}
85107

86108
@Override
@@ -90,7 +112,7 @@ public String getWriteableName() {
90112

91113
@Override
92114
protected NodeInfo<org.elasticsearch.xpack.esql.core.expression.predicate.regex.WildcardLike> info() {
93-
return NodeInfo.create(this, WildcardLike::new, field(), pattern());
115+
return NodeInfo.create(this, WildcardLike::new, field(), pattern(), caseInsensitive());
94116
}
95117

96118
@Override
@@ -114,7 +136,7 @@ public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvalua
114136
source(),
115137
toEvaluator.apply(field()),
116138
// The empty pattern will accept the empty string
117-
pattern().pattern().length() == 0 ? Automata.makeEmptyString() : pattern().createAutomaton()
139+
pattern().pattern().length() == 0 ? Automata.makeEmptyString() : pattern().createAutomaton(caseInsensitive())
118140
);
119141
}
120142

@@ -139,4 +161,9 @@ private Query translateField(String targetFieldName) {
139161
public Expression singleValueField() {
140162
return field();
141163
}
164+
165+
@Override
166+
public String nodeString() {
167+
return NAME + "(" + field().nodeString() + ", \"" + pattern().pattern() + "\", " + caseInsensitive() + ")";
168+
}
142169
}

0 commit comments

Comments
 (0)