Skip to content

Commit 34e08fb

Browse files
bpintea and elasticsearchmachine authored
[8.19] ESQL: Pushdown constructs doing case-insensitive regexes (#128393) (#128750) (#128753) (#128919)
* ESQL: Pushdown constructs doing case-insensitive regexes (#128393) This introduces an optimization to push down to Lucene those language constructs that aim at case-insensitive regular expression matching, used with `LIKE` and `RLIKE` operators, such as: * `| WHERE TO_LOWER(field) LIKE "abc*"` * `| WHERE TO_UPPER(field) RLIKE "ABC.*"` These are now pushed down to Lucene as case-insensitive `wildcard` and `regexp` queries, respectively. Closes #127479 (cherry picked from commit 0a80916) * ESQL: Fix conversion of a Lucene wildcard pattern to a regexp (#128750) This adds the reserved optional characters to the list that is escaped during conversion. These characters are all enabled by the `RegExp.ALL` flag in our use. Closes #128676, closes #128677. (cherry picked from commit 5eb54bf) * ESQL: Fix case-insensitive test generation with Unicodes (#128753) This excludes from testing the strings containing Unicode chars that change length when changing case. Closes #128705 Closes #128706 Closes #128710 Closes #128711 Closes Closes #128717 Closes #128789 Closes #128790 Closes #128791 Closes (cherry picked from commit 092d4ba) * [CI] Auto commit changes from spotless * Java21 adaptations and automerge fixes * [CI] Auto commit changes from spotless * 8.x's Lucene/RegExp doesn't support case-insensitive matching * [CI] Auto commit changes from spotless * One more Lucene 9 fix --------- Co-authored-by: elasticsearchmachine <[email protected]>
1 parent c5a3eb8 commit 34e08fb

File tree

35 files changed

+829
-239
lines changed

35 files changed

+829
-239
lines changed

benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/EvalBenchmark.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@
4545
import org.elasticsearch.xpack.esql.expression.function.scalar.math.RoundTo;
4646
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvMin;
4747
import org.elasticsearch.xpack.esql.expression.function.scalar.nulls.Coalesce;
48-
import org.elasticsearch.xpack.esql.expression.function.scalar.string.RLike;
4948
import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToLower;
5049
import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToUpper;
50+
import org.elasticsearch.xpack.esql.expression.function.scalar.string.regex.RLike;
5151
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Add;
5252
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.Equals;
5353
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.LessThan;

docs/changelog/128393.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 128393
2+
summary: Pushdown constructs doing case-insensitive regexes
3+
area: ES|QL
4+
type: enhancement
5+
issues:
6+
- 127479

docs/changelog/128750.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
pr: 128750
2+
summary: Fix conversion of a Lucene wildcard pattern to a regexp
3+
area: ES|QL
4+
type: bug
5+
issues:
6+
- 128677
7+
- 128676

server/src/main/java/org/elasticsearch/TransportVersions.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ static TransportVersion def(int id) {
234234
public static final TransportVersion DATA_STREAM_OPTIONS_API_REMOVE_INCLUDE_DEFAULTS_8_19 = def(8_841_0_41);
235235
public static final TransportVersion JOIN_ON_ALIASES_8_19 = def(8_841_0_42);
236236
public static final TransportVersion ILM_ADD_SKIP_SETTING_8_19 = def(8_841_0_43);
237+
public static final TransportVersion ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY_8_19 = def(8_841_0_44);
237238
/*
238239
* STOP! READ THIS FIRST! No, really,
239240
* ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/AbstractStringPattern.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ public abstract class AbstractStringPattern implements StringPattern {
1616

1717
private Automaton automaton;
1818

19-
public abstract Automaton createAutomaton();
19+
public abstract Automaton createAutomaton(boolean ignoreCase);
2020

2121
private Automaton automaton() {
2222
if (automaton == null) {
23-
automaton = createAutomaton();
23+
automaton = createAutomaton(false);
2424
}
2525
return automaton;
2626
}

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/RLike.java

Lines changed: 0 additions & 35 deletions
This file was deleted.

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/RLikePattern.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
package org.elasticsearch.xpack.esql.core.expression.predicate.regex;
88

99
import org.apache.lucene.util.automaton.Automaton;
10+
import org.apache.lucene.util.automaton.Operations;
1011
import org.apache.lucene.util.automaton.RegExp;
1112

1213
import java.util.Objects;
@@ -20,8 +21,12 @@ public RLikePattern(String regexpPattern) {
2021
}
2122

2223
@Override
23-
public Automaton createAutomaton() {
24-
return new RegExp(regexpPattern).toAutomaton();
24+
public Automaton createAutomaton(boolean ignoreCase) {
25+
int matchFlags = ignoreCase ? RegExp.ASCII_CASE_INSENSITIVE : 0;
26+
return Operations.determinize(
27+
new RegExp(regexpPattern, RegExp.ALL, matchFlags).toAutomaton(),
28+
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
29+
);
2530
}
2631

2732
@Override

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/WildcardLike.java

Lines changed: 0 additions & 35 deletions
This file was deleted.

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/predicate/regex/WildcardPattern.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,13 @@
1111
import org.apache.lucene.util.automaton.Automaton;
1212
import org.apache.lucene.util.automaton.MinimizationOperations;
1313
import org.apache.lucene.util.automaton.Operations;
14+
import org.apache.lucene.util.automaton.RegExp;
1415
import org.elasticsearch.xpack.esql.core.util.StringUtils;
1516

1617
import java.util.Objects;
1718

19+
import static org.elasticsearch.xpack.esql.core.util.StringUtils.luceneWildcardToRegExp;
20+
1821
/**
1922
* Similar to basic regex, supporting '?' wildcard for single character (same as regex ".")
2023
* and '*' wildcard for multiple characters (same as regex ".*")
@@ -38,9 +41,16 @@ public String pattern() {
3841
}
3942

4043
@Override
41-
public Automaton createAutomaton() {
42-
Automaton automaton = WildcardQuery.toAutomaton(new Term(null, wildcard));
43-
return MinimizationOperations.minimize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
44+
public Automaton createAutomaton(boolean ignoreCase) {
45+
return ignoreCase
46+
? Operations.determinize(
47+
new RegExp(luceneWildcardToRegExp(wildcard), RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE).toAutomaton(),
48+
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
49+
)
50+
: MinimizationOperations.minimize(
51+
WildcardQuery.toAutomaton(new Term(null, wildcard)),
52+
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
53+
);
4454
}
4555

4656
@Override

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/util/StringUtils.java

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
package org.elasticsearch.xpack.esql.core.util;
88

99
import org.apache.lucene.document.InetAddressPoint;
10+
import org.apache.lucene.search.WildcardQuery;
1011
import org.apache.lucene.search.spell.LevenshteinDistance;
1112
import org.apache.lucene.util.BytesRef;
1213
import org.apache.lucene.util.CollectionUtil;
@@ -178,6 +179,48 @@ public static String wildcardToJavaPattern(String pattern, char escape) {
178179
return regex.toString();
179180
}
180181

182+
/**
183+
* Translates a Lucene wildcard pattern to a Lucene RegExp one.
184+
* Note: all RegExp "optional" characters are escaped too (allowing the use of the {@code RegExp.ALL} flag).
185+
* @param wildcard Lucene wildcard pattern
186+
* @return Lucene RegExp pattern
187+
*/
188+
public static String luceneWildcardToRegExp(String wildcard) {
189+
StringBuilder regex = new StringBuilder();
190+
191+
for (int i = 0, wcLen = wildcard.length(); i < wcLen; i++) {
192+
char c = wildcard.charAt(i); // this will work chunking through Unicode as long as all values matched are ASCII
193+
switch (c) {
194+
case WildcardQuery.WILDCARD_STRING -> regex.append(".*");
195+
case WildcardQuery.WILDCARD_CHAR -> regex.append(".");
196+
case WildcardQuery.WILDCARD_ESCAPE -> {
197+
if (i + 1 < wcLen) {
198+
// consume the wildcard escaping, consider the next char
199+
char next = wildcard.charAt(i + 1);
200+
i++;
201+
switch (next) {
202+
case WildcardQuery.WILDCARD_STRING, WildcardQuery.WILDCARD_CHAR, WildcardQuery.WILDCARD_ESCAPE ->
203+
// escape `*`, `.`, `\`, since these are special chars in RegExp as well
204+
regex.append("\\");
205+
// default: unnecessary escaping -- just ignore the escaping
206+
}
207+
regex.append(next);
208+
} else {
209+
// "else fallthru, lenient parsing with a trailing \" -- according to WildcardQuery#toAutomaton
210+
regex.append("\\\\");
211+
}
212+
}
213+
// reserved RegExp characters
214+
case '"', '$', '(', ')', '+', '.', '[', ']', '^', '{', '|', '}' -> regex.append("\\").append(c);
215+
// reserved optional RegExp characters
216+
case '#', '&', '<', '>' -> regex.append("\\").append(c);
217+
default -> regex.append(c);
218+
}
219+
}
220+
221+
return regex.toString();
222+
}
223+
181224
/**
182225
* Translates a like pattern to a Lucene wildcard.
183226
* This methods pays attention to the custom escape char which gets converted into \ (used by Lucene).

0 commit comments

Comments
 (0)