Skip to content

Commit 090c103

Browse files
authored
Delay automaton creation in BinaryDvConfirmedQuery to avoid OOM on queries against WildCard fields (#136086)
1 parent 4d2bff7 commit 090c103

File tree

4 files changed

+166
-77
lines changed

docs/changelog/136086.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 136086
2+
summary: Delay automaton creation in `BinaryDvConfirmedQuery` to avoid OOM on queries
3+
against `WildCard` fields
4+
area: Search
5+
type: bug
6+
issues: []

x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/BinaryDvConfirmedQuery.java

Lines changed: 150 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,28 @@
1010
import org.apache.lucene.index.BinaryDocValues;
1111
import org.apache.lucene.index.DocValues;
1212
import org.apache.lucene.index.LeafReaderContext;
13+
import org.apache.lucene.index.Term;
1314
import org.apache.lucene.search.ConstantScoreScorer;
1415
import org.apache.lucene.search.ConstantScoreWeight;
1516
import org.apache.lucene.search.DocIdSetIterator;
17+
import org.apache.lucene.search.FuzzyQuery;
1618
import org.apache.lucene.search.IndexSearcher;
1719
import org.apache.lucene.search.Query;
1820
import org.apache.lucene.search.QueryVisitor;
1921
import org.apache.lucene.search.ScoreMode;
2022
import org.apache.lucene.search.Scorer;
2123
import org.apache.lucene.search.ScorerSupplier;
24+
import org.apache.lucene.search.TermRangeQuery;
2225
import org.apache.lucene.search.TwoPhaseIterator;
2326
import org.apache.lucene.search.Weight;
27+
import org.apache.lucene.search.WildcardQuery;
2428
import org.apache.lucene.util.BytesRef;
2529
import org.apache.lucene.util.automaton.Automaton;
2630
import org.apache.lucene.util.automaton.ByteRunAutomaton;
31+
import org.apache.lucene.util.automaton.Operations;
32+
import org.apache.lucene.util.automaton.RegExp;
2733
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
34+
import org.elasticsearch.common.lucene.search.AutomatonQueries;
2835

2936
import java.io.IOException;
3037
import java.util.Arrays;
@@ -46,15 +53,65 @@ private BinaryDvConfirmedQuery(Query approximation, String field) {
4653
}
4754

4855
/**
49-
* Returns a query that runs the provided Automaton across all binary doc values (but only for docs that also
50-
* match a provided approximation query which is key to getting good performance).
56+
* Returns a query that runs the generated Automaton from a range query across
57+
* all binary doc values (but only for docs that also match a provided approximation query which is key
58+
* to getting good performance).
5159
*/
52-
public static Query fromAutomaton(Query approximation, String field, String matchPattern, Automaton automaton) {
53-
return new BinaryDvConfirmedAutomatonQuery(approximation, field, matchPattern, automaton);
60+
public static Query fromRangeQuery(
61+
Query approximation,
62+
String field,
63+
BytesRef lower,
64+
BytesRef upper,
65+
boolean includeLower,
66+
boolean includeUpper
67+
) {
68+
return new BinaryDvConfirmedAutomatonQuery(
69+
approximation,
70+
field,
71+
new RangeAutomatonProvider(lower, upper, includeLower, includeUpper)
72+
);
5473
}
5574

5675
/**
57-
* Returns a query that checks for equality of at leat one of the provided terms across
76+
* Returns a query that runs the generated Automaton from a wildcard query across
77+
* all binary doc values (but only for docs that also match a provided approximation query which is key
78+
* to getting good performance).
79+
*/
80+
public static Query fromWildcardQuery(Query approximation, String field, String matchPattern, boolean caseInsensitive) {
81+
return new BinaryDvConfirmedAutomatonQuery(approximation, field, new PatternAutomatonProvider(matchPattern, caseInsensitive));
82+
}
83+
84+
/**
85+
* Returns a query that runs the generated Automaton from a regexp query across
86+
* all binary doc values (but only for docs that also match a provided approximation query which is key
87+
* to getting good performance).
88+
*/
89+
public static Query fromRegexpQuery(
90+
Query approximation,
91+
String field,
92+
String value,
93+
int syntaxFlags,
94+
int matchFlags,
95+
int maxDeterminizedStates
96+
) {
97+
return new BinaryDvConfirmedAutomatonQuery(
98+
approximation,
99+
field,
100+
new RegexAutomatonProvider(value, syntaxFlags, matchFlags, maxDeterminizedStates)
101+
);
102+
}
103+
104+
/**
105+
* Returns a query that runs the generated Automaton from a fuzzy query across
106+
* all binary doc values (but only for docs that also match a provided approximation query which is key
107+
* to getting good performance).
108+
*/
109+
public static Query fromFuzzyQuery(Query approximation, String field, String searchTerm, FuzzyQuery fuzzyQuery) {
110+
return new BinaryDvConfirmedAutomatonQuery(approximation, field, new FuzzyQueryAutomatonProvider(searchTerm, fuzzyQuery));
111+
}
112+
113+
/**
114+
* Returns a query that checks for equality of at least one of the provided terms across
58115
* all binary doc values (but only for docs that also match a provided approximation query which
59116
* is key to getting good performance).
60117
*/
@@ -63,7 +120,7 @@ public static Query fromTerms(Query approximation, String field, BytesRef... ter
63120
return new BinaryDvConfirmedTermsQuery(approximation, field, terms);
64121
}
65122

66-
protected abstract boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException;
123+
protected abstract BinaryDVMatcher getBinaryDVMatcher();
67124

68125
protected abstract Query rewrite(Query approxRewrite) throws IOException;
69126

@@ -79,7 +136,7 @@ public Query rewrite(IndexSearcher searcher) throws IOException {
79136
@Override
80137
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
81138
final Weight approxWeight = approxQuery.createWeight(searcher, scoreMode, boost);
82-
139+
final BinaryDVMatcher matcher = getBinaryDVMatcher();
83140
return new ConstantScoreWeight(this, boost) {
84141

85142
@Override
@@ -106,7 +163,7 @@ public boolean matches() throws IOException {
106163
}
107164
final BytesRef bytesRef = values.binaryValue();
108165
bytes.reset(bytesRef.bytes, bytesRef.offset, bytesRef.length);
109-
return matchesBinaryDV(bytes, bytesRef, scratch);
166+
return matcher.matchesBinaryDV(bytes, bytesRef, scratch);
110167
}
111168

112169
@Override
@@ -157,55 +214,56 @@ public void visit(QueryVisitor visitor) {
157214
}
158215
}
159216

160-
private static class BinaryDvConfirmedAutomatonQuery extends BinaryDvConfirmedQuery {
217+
interface BinaryDVMatcher {
218+
boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException;
219+
}
161220

162-
private final ByteRunAutomaton byteRunAutomaton;
163-
private final String matchPattern;
221+
private static class BinaryDvConfirmedAutomatonQuery extends BinaryDvConfirmedQuery {
164222

165-
private BinaryDvConfirmedAutomatonQuery(Query approximation, String field, String matchPattern, Automaton automaton) {
166-
this(approximation, field, matchPattern, new ByteRunAutomaton(automaton));
167-
}
223+
private final AutomatonProvider automatonProvider;
168224

169-
private BinaryDvConfirmedAutomatonQuery(Query approximation, String field, String matchPattern, ByteRunAutomaton byteRunAutomaton) {
225+
private BinaryDvConfirmedAutomatonQuery(Query approximation, String field, AutomatonProvider automatonProvider) {
170226
super(approximation, field);
171-
this.matchPattern = matchPattern;
172-
this.byteRunAutomaton = byteRunAutomaton;
227+
this.automatonProvider = automatonProvider;
173228
}
174229

175230
@Override
176-
protected boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException {
177-
int size = bytes.readVInt();
178-
for (int i = 0; i < size; i++) {
179-
int valLength = bytes.readVInt();
180-
if (byteRunAutomaton.run(bytesRef.bytes, bytes.getPosition(), valLength)) {
181-
return true;
231+
protected BinaryDVMatcher getBinaryDVMatcher() {
232+
final ByteRunAutomaton byteRunAutomaton = new ByteRunAutomaton(automatonProvider.getAutomaton(field));
233+
return (bytes, bytesRef, scratch) -> {
234+
final int size = bytes.readVInt();
235+
for (int i = 0; i < size; i++) {
236+
final int valLength = bytes.readVInt();
237+
if (byteRunAutomaton.run(bytesRef.bytes, bytes.getPosition(), valLength)) {
238+
return true;
239+
}
240+
bytes.skipBytes(valLength);
182241
}
183-
bytes.skipBytes(valLength);
184-
}
185-
return false;
242+
return false;
243+
};
186244
}
187245

188246
@Override
189247
protected Query rewrite(Query approxRewrite) {
190-
return new BinaryDvConfirmedAutomatonQuery(approxRewrite, field, matchPattern, byteRunAutomaton);
248+
return new BinaryDvConfirmedAutomatonQuery(approxRewrite, field, automatonProvider);
191249
}
192250

193251
@Override
194252
public String toString(String field) {
195-
return field + ":" + matchPattern;
253+
return field + ":" + automatonProvider.toString();
196254
}
197255

198256
@Override
199257
public boolean equals(Object o) {
200258
if (o == null || getClass() != o.getClass()) return false;
201259
if (super.equals(o) == false) return false;
202260
BinaryDvConfirmedAutomatonQuery other = (BinaryDvConfirmedAutomatonQuery) o;
203-
return Objects.equals(byteRunAutomaton, other.byteRunAutomaton) && Objects.equals(matchPattern, other.matchPattern);
261+
return Objects.equals(automatonProvider, other.automatonProvider);
204262
}
205263

206264
@Override
207265
public int hashCode() {
208-
return Objects.hash(super.hashCode(), matchPattern, byteRunAutomaton);
266+
return Objects.hash(super.hashCode(), automatonProvider);
209267
}
210268
}
211269

@@ -220,28 +278,31 @@ private BinaryDvConfirmedTermsQuery(Query approximation, String field, BytesRef[
220278
}
221279

222280
@Override
223-
protected boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException {
224-
scratch.bytes = bytesRef.bytes;
225-
final int size = bytes.readVInt();
226-
for (int i = 0; i < size; i++) {
227-
final int valLength = bytes.readVInt();
228-
scratch.offset = bytes.getPosition();
229-
scratch.length = valLength;
230-
if (terms.length == 1) {
231-
if (terms[0].bytesEquals(scratch)) {
232-
return true;
233-
}
234-
} else {
235-
final int pos = Arrays.binarySearch(terms, scratch, BytesRef::compareTo);
236-
if (pos >= 0) {
237-
assert terms[pos].bytesEquals(scratch) : "Expected term at position " + pos + " to match scratch, but it did not.";
238-
return true;
281+
protected BinaryDVMatcher getBinaryDVMatcher() {
282+
return (bytes, bytesRef, scratch) -> {
283+
scratch.bytes = bytesRef.bytes;
284+
final int size = bytes.readVInt();
285+
for (int i = 0; i < size; i++) {
286+
final int valLength = bytes.readVInt();
287+
scratch.offset = bytes.getPosition();
288+
scratch.length = valLength;
289+
if (terms.length == 1) {
290+
if (terms[0].bytesEquals(scratch)) {
291+
return true;
292+
}
293+
} else {
294+
final int pos = Arrays.binarySearch(terms, scratch, BytesRef::compareTo);
295+
if (pos >= 0) {
296+
assert terms[pos].bytesEquals(scratch)
297+
: "Expected term at position " + pos + " to match scratch, but it did not.";
298+
return true;
299+
}
239300
}
301+
bytes.skipBytes(valLength);
240302
}
241-
bytes.skipBytes(valLength);
242-
}
243-
assert bytes.available() == 0 : "Expected no bytes left to read, but found " + bytes.available();
244-
return false;
303+
assert bytes.available() == 0 : "Expected no bytes left to read, but found " + bytes.available();
304+
return false;
305+
};
245306
}
246307

247308
@Override
@@ -275,4 +336,43 @@ public int hashCode() {
275336
return Objects.hash(super.hashCode(), Arrays.hashCode(terms));
276337
}
277338
}
339+
340+
private interface AutomatonProvider {
341+
Automaton getAutomaton(String field);
342+
}
343+
344+
private record PatternAutomatonProvider(String matchPattern, boolean caseInsensitive) implements AutomatonProvider {
345+
@Override
346+
public Automaton getAutomaton(String field) {
347+
return caseInsensitive
348+
? AutomatonQueries.toCaseInsensitiveWildcardAutomaton(new Term(field, matchPattern))
349+
: WildcardQuery.toAutomaton(new Term(field, matchPattern), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
350+
}
351+
}
352+
353+
private record RegexAutomatonProvider(String value, int syntaxFlags, int matchFlags, int maxDeterminizedStates)
354+
implements
355+
AutomatonProvider {
356+
@Override
357+
public Automaton getAutomaton(String field) {
358+
RegExp regex = new RegExp(value, syntaxFlags, matchFlags);
359+
return Operations.determinize(regex.toAutomaton(), maxDeterminizedStates);
360+
}
361+
}
362+
363+
private record RangeAutomatonProvider(BytesRef lower, BytesRef upper, boolean includeLower, boolean includeUpper)
364+
implements
365+
AutomatonProvider {
366+
@Override
367+
public Automaton getAutomaton(String field) {
368+
return TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
369+
}
370+
}
371+
372+
private record FuzzyQueryAutomatonProvider(String searchTerm, FuzzyQuery fuzzyQuery) implements AutomatonProvider {
373+
@Override
374+
public Automaton getAutomaton(String field) {
375+
return fuzzyQuery.getAutomata().automaton;
376+
}
377+
}
278378
}

x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
import org.apache.lucene.search.Query;
3737
import org.apache.lucene.search.TermInSetQuery;
3838
import org.apache.lucene.search.TermQuery;
39-
import org.apache.lucene.search.TermRangeQuery;
4039
import org.apache.lucene.search.WildcardQuery;
4140
import org.apache.lucene.util.BytesRef;
4241
import org.apache.lucene.util.automaton.Automaton;
@@ -47,7 +46,6 @@
4746
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
4847
import org.elasticsearch.common.lucene.BytesRefs;
4948
import org.elasticsearch.common.lucene.Lucene;
50-
import org.elasticsearch.common.lucene.search.AutomatonQueries;
5149
import org.elasticsearch.common.time.DateMathParser;
5250
import org.elasticsearch.common.unit.Fuzziness;
5351
import org.elasticsearch.core.Nullable;
@@ -314,15 +312,12 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, boolean
314312
// We have no concrete characters and we're not a pure length query e.g. ???
315313
return new FieldExistsQuery(name());
316314
}
317-
Automaton automaton = caseInsensitive
318-
? AutomatonQueries.toCaseInsensitiveWildcardAutomaton(new Term(name(), wildcardPattern))
319-
: WildcardQuery.toAutomaton(new Term(name(), wildcardPattern), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
320315
if (numClauses > 0) {
321316
// We can accelerate execution with the ngram query
322317
BooleanQuery approxQuery = rewritten.build();
323-
return BinaryDvConfirmedQuery.fromAutomaton(approxQuery, name(), wildcardPattern, automaton);
318+
return BinaryDvConfirmedQuery.fromWildcardQuery(approxQuery, name(), wildcardPattern, caseInsensitive);
324319
} else {
325-
return BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), name(), wildcardPattern, automaton);
320+
return BinaryDvConfirmedQuery.fromWildcardQuery(new MatchAllDocsQuery(), name(), wildcardPattern, caseInsensitive);
326321
}
327322
}
328323

@@ -417,11 +412,8 @@ public Query regexpQuery(
417412
Query approxBooleanQuery = toApproximationQuery(ngramRegex);
418413
Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery);
419414

420-
RegExp regex = new RegExp(value, syntaxFlags, matchFlags);
421-
Automaton automaton = Operations.determinize(regex.toAutomaton(), maxDeterminizedStates);
422-
423415
// We can accelerate execution with the ngram query
424-
return BinaryDvConfirmedQuery.fromAutomaton(approxNgramQuery, name(), value, automaton);
416+
return BinaryDvConfirmedQuery.fromRegexpQuery(approxNgramQuery, name(), value, syntaxFlags, matchFlags, maxDeterminizedStates);
425417
}
426418

427419
// Convert a regular expression to a simplified query consisting of BooleanQuery and TermQuery objects
@@ -750,12 +742,11 @@ public Query rangeQuery(
750742
}
751743
}
752744
}
753-
Automaton automaton = TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
754745

755746
if (accelerationQuery == null) {
756-
return BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), name(), lower + "-" + upper, automaton);
747+
return BinaryDvConfirmedQuery.fromRangeQuery(new MatchAllDocsQuery(), name(), lower, upper, includeLower, includeUpper);
757748
}
758-
return BinaryDvConfirmedQuery.fromAutomaton(accelerationQuery, name(), lower + "-" + upper, automaton);
749+
return BinaryDvConfirmedQuery.fromRangeQuery(accelerationQuery, name(), lower, upper, includeLower, includeUpper);
759750
}
760751

761752
@Override
@@ -844,10 +835,9 @@ public Query fuzzyQuery(
844835
rewriteMethod
845836
);
846837
if (ngramQ.clauses().size() == 0) {
847-
return BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), name(), searchTerm, fq.getAutomata().automaton);
838+
return BinaryDvConfirmedQuery.fromFuzzyQuery(new MatchAllDocsQuery(), name(), searchTerm, fq);
848839
}
849-
850-
return BinaryDvConfirmedQuery.fromAutomaton(ngramQ, name(), searchTerm, fq.getAutomata().automaton);
840+
return BinaryDvConfirmedQuery.fromFuzzyQuery(ngramQ, name(), searchTerm, fq);
851841
} catch (IOException ioe) {
852842
throw new ElasticsearchParseException("Error parsing wildcard field fuzzy string [" + searchTerm + "]");
853843
}

0 commit comments

Comments
 (0)