Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/136086.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 136086
summary: Delay automaton creation in `BinaryDvConfirmedQuery` to avoid OOM on queries
against `WildCard` fields
area: Search
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,28 @@
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
import org.elasticsearch.common.lucene.search.AutomatonQueries;

import java.io.IOException;
import java.util.Arrays;
Expand All @@ -46,15 +53,65 @@ private BinaryDvConfirmedQuery(Query approximation, String field) {
}

/**
* Returns a query that runs the provided Automaton across all binary doc values (but only for docs that also
* match a provided approximation query which is key to getting good performance).
* Returns a query that runs the generated Automaton from a range query across
* all binary doc values (but only for docs that also match a provided approximation query which is key
* to getting good performance).
*/
public static Query fromAutomaton(Query approximation, String field, String matchPattern, Automaton automaton) {
return new BinaryDvConfirmedAutomatonQuery(approximation, field, matchPattern, automaton);
public static Query fromRangeQuery(
    Query approximation,
    String field,
    BytesRef lower,
    BytesRef upper,
    boolean includeLower,
    boolean includeUpper
) {
    // Only the range bounds are captured here; the provider turns them into an automaton
    // when its getAutomaton method is invoked.
    final RangeAutomatonProvider rangeProvider = new RangeAutomatonProvider(lower, upper, includeLower, includeUpper);
    return new BinaryDvConfirmedAutomatonQuery(approximation, field, rangeProvider);
}

/**
* Returns a query that checks for equality of at leat one of the provided terms across
* Returns a query that runs the generated Automaton from a wildcard query across
* all binary doc values (but only for docs that also match a provided approximation query which is key
* to getting good performance).
*/
public static Query fromWildcardQuery(Query approximation, String field, String matchPattern, boolean caseInsensitive) {
    // Keep just the pattern and the case flag; the provider builds the wildcard automaton
    // when its getAutomaton method is invoked.
    final PatternAutomatonProvider patternProvider = new PatternAutomatonProvider(matchPattern, caseInsensitive);
    return new BinaryDvConfirmedAutomatonQuery(approximation, field, patternProvider);
}

/**
 * Creates a query that confirms candidate documents by running a regexp-derived automaton over
 * their binary doc values. Only documents matched by {@code approximation} are checked, which is
 * key to getting good performance.
 * <p>
 * The regular expression is not compiled here: the inputs are stored in a
 * {@link RegexAutomatonProvider}, which builds the automaton when asked for it.
 *
 * @param approximation         query selecting the candidate documents to confirm
 * @param field                 name of the field whose binary doc values are inspected
 * @param value                 the regular expression source
 * @param syntaxFlags           syntax flags handed to the {@link RegExp} constructor
 * @param matchFlags            match flags handed to the {@link RegExp} constructor
 * @param maxDeterminizedStates limit handed to {@link Operations#determinize} when the automaton is built
 * @return the confirming query
 */
public static Query fromRegexpQuery(
    Query approximation,
    String field,
    String value,
    int syntaxFlags,
    int matchFlags,
    int maxDeterminizedStates
) {
    final RegexAutomatonProvider regexProvider = new RegexAutomatonProvider(value, syntaxFlags, matchFlags, maxDeterminizedStates);
    return new BinaryDvConfirmedAutomatonQuery(approximation, field, regexProvider);
}

/**
 * Creates a query that confirms candidate documents by running the automaton of a fuzzy query
 * over their binary doc values. Only documents matched by {@code approximation} are checked,
 * which is key to getting good performance.
 * <p>
 * The automaton is not requested here: {@code fuzzyQuery} is stored in a
 * {@link FuzzyQueryAutomatonProvider}, which calls {@code fuzzyQuery.getAutomata()} when asked
 * for the automaton.
 *
 * @param approximation query selecting the candidate documents to confirm
 * @param field         name of the field whose binary doc values are inspected
 * @param searchTerm    the fuzzy search term, stored on the provider alongside the query
 * @param fuzzyQuery    the fuzzy query that supplies the automaton
 * @return the confirming query
 */
public static Query fromFuzzyQuery(Query approximation, String field, String searchTerm, FuzzyQuery fuzzyQuery) {
    final FuzzyQueryAutomatonProvider fuzzyProvider = new FuzzyQueryAutomatonProvider(searchTerm, fuzzyQuery);
    return new BinaryDvConfirmedAutomatonQuery(approximation, field, fuzzyProvider);
}

/**
* Returns a query that checks for equality of at least one of the provided terms across
* all binary doc values (but only for docs that also match a provided approximation query which
* is key to getting good performance).
*/
Expand All @@ -63,7 +120,7 @@ public static Query fromTerms(Query approximation, String field, BytesRef... ter
return new BinaryDvConfirmedTermsQuery(approximation, field, terms);
}

protected abstract boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException;
protected abstract BinaryDVMatcher getBinaryDVMatcher();

protected abstract Query rewrite(Query approxRewrite) throws IOException;

Expand All @@ -79,7 +136,7 @@ public Query rewrite(IndexSearcher searcher) throws IOException {
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
final Weight approxWeight = approxQuery.createWeight(searcher, scoreMode, boost);

final BinaryDVMatcher matcher = getBinaryDVMatcher();
return new ConstantScoreWeight(this, boost) {

@Override
Expand All @@ -106,7 +163,7 @@ public boolean matches() throws IOException {
}
final BytesRef bytesRef = values.binaryValue();
bytes.reset(bytesRef.bytes, bytesRef.offset, bytesRef.length);
return matchesBinaryDV(bytes, bytesRef, scratch);
return matcher.matchesBinaryDV(bytes, bytesRef, scratch);
}

@Override
Expand Down Expand Up @@ -157,55 +214,56 @@ public void visit(QueryVisitor visitor) {
}
}

private static class BinaryDvConfirmedAutomatonQuery extends BinaryDvConfirmedQuery {
/**
 * Per-document check applied to the binary doc value of a candidate document.
 * An instance is obtained once in {@code createWeight} via {@code getBinaryDVMatcher()} and then
 * called from the two-phase iterator's {@code matches()}.
 */
interface BinaryDVMatcher {
/**
 * Decides whether the current document's binary doc value matches.
 *
 * @param bytes    stream input that the caller has reset onto {@code bytesRef}'s bytes, offset and length;
 *                 the implementations in this file read a vInt value count, then a vInt length before each value
 * @param bytesRef the raw binary doc value of the current document
 * @param scratch  caller-supplied {@link BytesRef} that an implementation may overwrite while comparing
 * @return {@code true} if the document matches
 * @throws IOException if reading from {@code bytes} fails
 */
boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException;
}

private final ByteRunAutomaton byteRunAutomaton;
private final String matchPattern;
private static class BinaryDvConfirmedAutomatonQuery extends BinaryDvConfirmedQuery {

private BinaryDvConfirmedAutomatonQuery(Query approximation, String field, String matchPattern, Automaton automaton) {
this(approximation, field, matchPattern, new ByteRunAutomaton(automaton));
}
private final AutomatonProvider automatonProvider;

private BinaryDvConfirmedAutomatonQuery(Query approximation, String field, String matchPattern, ByteRunAutomaton byteRunAutomaton) {
private BinaryDvConfirmedAutomatonQuery(Query approximation, String field, AutomatonProvider automatonProvider) {
super(approximation, field);
this.matchPattern = matchPattern;
this.byteRunAutomaton = byteRunAutomaton;
this.automatonProvider = automatonProvider;
}

@Override
protected boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException {
int size = bytes.readVInt();
for (int i = 0; i < size; i++) {
int valLength = bytes.readVInt();
if (byteRunAutomaton.run(bytesRef.bytes, bytes.getPosition(), valLength)) {
return true;
protected BinaryDVMatcher getBinaryDVMatcher() {
final ByteRunAutomaton byteRunAutomaton = new ByteRunAutomaton(automatonProvider.getAutomaton(field));
return (bytes, bytesRef, scratch) -> {
final int size = bytes.readVInt();
for (int i = 0; i < size; i++) {
final int valLength = bytes.readVInt();
if (byteRunAutomaton.run(bytesRef.bytes, bytes.getPosition(), valLength)) {
return true;
}
bytes.skipBytes(valLength);
}
bytes.skipBytes(valLength);
}
return false;
return false;
};
}

@Override
protected Query rewrite(Query approxRewrite) {
return new BinaryDvConfirmedAutomatonQuery(approxRewrite, field, matchPattern, byteRunAutomaton);
return new BinaryDvConfirmedAutomatonQuery(approxRewrite, field, automatonProvider);
}

@Override
public String toString(String field) {
return field + ":" + matchPattern;
return field + ":" + automatonProvider.toString();
}

@Override
public boolean equals(Object o) {
if (o == null || getClass() != o.getClass()) return false;
if (super.equals(o) == false) return false;
BinaryDvConfirmedAutomatonQuery other = (BinaryDvConfirmedAutomatonQuery) o;
return Objects.equals(byteRunAutomaton, other.byteRunAutomaton) && Objects.equals(matchPattern, other.matchPattern);
return Objects.equals(automatonProvider, other.automatonProvider);
}

@Override
public int hashCode() {
return Objects.hash(super.hashCode(), matchPattern, byteRunAutomaton);
return Objects.hash(super.hashCode(), automatonProvider);
}
}

Expand All @@ -220,28 +278,31 @@ private BinaryDvConfirmedTermsQuery(Query approximation, String field, BytesRef[
}

@Override
protected boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException {
scratch.bytes = bytesRef.bytes;
final int size = bytes.readVInt();
for (int i = 0; i < size; i++) {
final int valLength = bytes.readVInt();
scratch.offset = bytes.getPosition();
scratch.length = valLength;
if (terms.length == 1) {
if (terms[0].bytesEquals(scratch)) {
return true;
}
} else {
final int pos = Arrays.binarySearch(terms, scratch, BytesRef::compareTo);
if (pos >= 0) {
assert terms[pos].bytesEquals(scratch) : "Expected term at position " + pos + " to match scratch, but it did not.";
return true;
protected BinaryDVMatcher getBinaryDVMatcher() {
return (bytes, bytesRef, scratch) -> {
scratch.bytes = bytesRef.bytes;
final int size = bytes.readVInt();
for (int i = 0; i < size; i++) {
final int valLength = bytes.readVInt();
scratch.offset = bytes.getPosition();
scratch.length = valLength;
if (terms.length == 1) {
if (terms[0].bytesEquals(scratch)) {
return true;
}
} else {
final int pos = Arrays.binarySearch(terms, scratch, BytesRef::compareTo);
if (pos >= 0) {
assert terms[pos].bytesEquals(scratch)
: "Expected term at position " + pos + " to match scratch, but it did not.";
return true;
}
}
bytes.skipBytes(valLength);
}
bytes.skipBytes(valLength);
}
assert bytes.available() == 0 : "Expected no bytes left to read, but found " + bytes.available();
return false;
assert bytes.available() == 0 : "Expected no bytes left to read, but found " + bytes.available();
return false;
};
}

@Override
Expand Down Expand Up @@ -275,4 +336,43 @@ public int hashCode() {
return Objects.hash(super.hashCode(), Arrays.hashCode(terms));
}
}

/**
 * Supplies the {@link Automaton} used to confirm matches against binary doc values.
 * The query keeps a provider rather than a built automaton and calls
 * {@link #getAutomaton(String)} from {@code getBinaryDVMatcher()}; the provider is also what the
 * query compares in {@code equals}/{@code hashCode} and prints in {@code toString}.
 */
private interface AutomatonProvider {
/**
 * Builds or fetches the automaton.
 *
 * @param field name of the queried field; implementations may ignore it
 * @return the automaton to run over each stored value
 */
Automaton getAutomaton(String field);
}

/**
 * Provider for wildcard patterns: keeps the raw pattern and the case-sensitivity flag and only
 * converts them into an automaton when {@link #getAutomaton(String)} is called.
 */
private record PatternAutomatonProvider(String matchPattern, boolean caseInsensitive) implements AutomatonProvider {
    @Override
    public Automaton getAutomaton(String field) {
        // Both conversions take the pattern wrapped in a Term for the queried field.
        final Term patternTerm = new Term(field, matchPattern);
        if (caseInsensitive) {
            return AutomatonQueries.toCaseInsensitiveWildcardAutomaton(patternTerm);
        }
        return WildcardQuery.toAutomaton(patternTerm, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
    }
}

/**
 * Provider for regular expressions: keeps the expression source, its flags and the determinize
 * limit, and only parses and determinizes when {@link #getAutomaton(String)} is called.
 */
private record RegexAutomatonProvider(String value, int syntaxFlags, int matchFlags, int maxDeterminizedStates)
    implements
        AutomatonProvider {
    @Override
    public Automaton getAutomaton(String field) {
        // The field name plays no part in building a regexp automaton.
        final Automaton parsed = new RegExp(value, syntaxFlags, matchFlags).toAutomaton();
        return Operations.determinize(parsed, maxDeterminizedStates);
    }
}

/**
 * Provider for term ranges: keeps the bounds and their inclusivity flags and only converts them
 * into an automaton when {@link #getAutomaton(String)} is called.
 */
private record RangeAutomatonProvider(
    BytesRef lower,
    BytesRef upper,
    boolean includeLower,
    boolean includeUpper
) implements AutomatonProvider {
    @Override
    public Automaton getAutomaton(String field) {
        // The field name plays no part in building a range automaton.
        return TermRangeQuery.toAutomaton(lower(), upper(), includeLower(), includeUpper());
    }
}

/**
 * Provider backed by a {@link FuzzyQuery}: the automaton is taken from the query's
 * {@code getAutomata()} result when {@link #getAutomaton(String)} is called. The
 * {@code searchTerm} component is not used to build the automaton.
 */
private record FuzzyQueryAutomatonProvider(String searchTerm, FuzzyQuery fuzzyQuery) implements AutomatonProvider {
    @Override
    public Automaton getAutomaton(String field) {
        // The field name is ignored here; the automaton comes solely from the stored fuzzy query.
        return fuzzyQuery().getAutomata().automaton;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
Expand All @@ -47,7 +46,6 @@
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.AutomatonQueries;
import org.elasticsearch.common.time.DateMathParser;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.core.Nullable;
Expand Down Expand Up @@ -314,15 +312,12 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, boolean
// We have no concrete characters and we're not a pure length query e.g. ???
return new FieldExistsQuery(name());
}
Automaton automaton = caseInsensitive
? AutomatonQueries.toCaseInsensitiveWildcardAutomaton(new Term(name(), wildcardPattern))
: WildcardQuery.toAutomaton(new Term(name(), wildcardPattern), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
if (numClauses > 0) {
// We can accelerate execution with the ngram query
BooleanQuery approxQuery = rewritten.build();
return BinaryDvConfirmedQuery.fromAutomaton(approxQuery, name(), wildcardPattern, automaton);
return BinaryDvConfirmedQuery.fromWildcardQuery(approxQuery, name(), wildcardPattern, caseInsensitive);
} else {
return BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), name(), wildcardPattern, automaton);
return BinaryDvConfirmedQuery.fromWildcardQuery(new MatchAllDocsQuery(), name(), wildcardPattern, caseInsensitive);
}
}

Expand Down Expand Up @@ -417,11 +412,8 @@ public Query regexpQuery(
Query approxBooleanQuery = toApproximationQuery(ngramRegex);
Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery);

RegExp regex = new RegExp(value, syntaxFlags, matchFlags);
Automaton automaton = Operations.determinize(regex.toAutomaton(), maxDeterminizedStates);

// We can accelerate execution with the ngram query
return BinaryDvConfirmedQuery.fromAutomaton(approxNgramQuery, name(), value, automaton);
return BinaryDvConfirmedQuery.fromRegexpQuery(approxNgramQuery, name(), value, syntaxFlags, matchFlags, maxDeterminizedStates);
}

// Convert a regular expression to a simplified query consisting of BooleanQuery and TermQuery objects
Expand Down Expand Up @@ -750,12 +742,11 @@ public Query rangeQuery(
}
}
}
Automaton automaton = TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);

if (accelerationQuery == null) {
return BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), name(), lower + "-" + upper, automaton);
return BinaryDvConfirmedQuery.fromRangeQuery(new MatchAllDocsQuery(), name(), lower, upper, includeLower, includeUpper);
}
return BinaryDvConfirmedQuery.fromAutomaton(accelerationQuery, name(), lower + "-" + upper, automaton);
return BinaryDvConfirmedQuery.fromRangeQuery(accelerationQuery, name(), lower, upper, includeLower, includeUpper);
}

@Override
Expand Down Expand Up @@ -844,10 +835,9 @@ public Query fuzzyQuery(
rewriteMethod
);
if (ngramQ.clauses().size() == 0) {
return BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), name(), searchTerm, fq.getAutomata().automaton);
return BinaryDvConfirmedQuery.fromFuzzyQuery(new MatchAllDocsQuery(), name(), searchTerm, fq);
}

return BinaryDvConfirmedQuery.fromAutomaton(ngramQ, name(), searchTerm, fq.getAutomata().automaton);
return BinaryDvConfirmedQuery.fromFuzzyQuery(ngramQ, name(), searchTerm, fq);
} catch (IOException ioe) {
throw new ElasticsearchParseException("Error parsing wildcard field fuzzy string [" + searchTerm + "]");
}
Expand Down
Loading