From 2c986431d68d4cbd98897b459d399bf300eb0dcd Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Tue, 7 Oct 2025 15:45:07 +0200 Subject: [PATCH 1/2] Delay automaton creation in BinaryDvConfirmedQuery --- .../mapper/BinaryDvConfirmedQuery.java | 200 +++++++++++++----- .../wildcard/mapper/WildcardFieldMapper.java | 24 +-- .../mapper/WildcardFieldMapperTests.java | 13 +- 3 files changed, 160 insertions(+), 77 deletions(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/BinaryDvConfirmedQuery.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/BinaryDvConfirmedQuery.java index cfd5a141a5128..aa6f944cb2a00 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/BinaryDvConfirmedQuery.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/BinaryDvConfirmedQuery.java @@ -10,21 +10,28 @@ import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreWeight; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.Weight; +import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; +import org.apache.lucene.util.automaton.Operations; +import 
org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.common.io.stream.ByteArrayStreamInput; +import org.elasticsearch.common.lucene.search.AutomatonQueries; import java.io.IOException; import java.util.Arrays; @@ -46,15 +53,65 @@ private BinaryDvConfirmedQuery(Query approximation, String field) { } /** - * Returns a query that runs the provided Automaton across all binary doc values (but only for docs that also - * match a provided approximation query which is key to getting good performance). + * Returns a query that runs the generated Automaton from a range query across + * all binary doc values (but only for docs that also match a provided approximation query which is key + * to getting good performance). */ - public static Query fromAutomaton(Query approximation, String field, String matchPattern, Automaton automaton) { - return new BinaryDvConfirmedAutomatonQuery(approximation, field, matchPattern, automaton); + public static Query fromRangeQuery( + Query approximation, + String field, + BytesRef lower, + BytesRef upper, + boolean includeLower, + boolean includeUpper + ) { + return new BinaryDvConfirmedAutomatonQuery( + approximation, + field, + new RangeAutomatonProvider(lower, upper, includeLower, includeUpper) + ); } /** - * Returns a query that checks for equality of at leat one of the provided terms across + * Returns a query that runs the generated Automaton from a wildcard query across + * all binary doc values (but only for docs that also match a provided approximation query which is key + * to getting good performance). 
+ */ + public static Query fromWildcardQuery(Query approximation, String field, String matchPattern, boolean caseInsensitive) { + return new BinaryDvConfirmedAutomatonQuery(approximation, field, new PatternAutomatonProvider(matchPattern, caseInsensitive)); + } + + /** + * Returns a query that runs the generated Automaton from a regexp query across + * all binary doc values (but only for docs that also match a provided approximation query which is key + * to getting good performance). + */ + public static Query fromRegexpQuery( + Query approximation, + String field, + String value, + int syntaxFlags, + int matchFlags, + int maxDeterminizedStates + ) { + return new BinaryDvConfirmedAutomatonQuery( + approximation, + field, + new RegexAutomatonProvider(value, syntaxFlags, matchFlags, maxDeterminizedStates) + ); + } + + /** + * Returns a query that runs the generated Automaton from a fuzzy query across + * all binary doc values (but only for docs that also match a provided approximation query which is key + * to getting good performance). + */ + public static Query fromFuzzyQuery(Query approximation, String field, String searchTerm, FuzzyQuery fuzzyQuery) { + return new BinaryDvConfirmedAutomatonQuery(approximation, field, new FuzzyQueryAutomatonProvider(searchTerm, fuzzyQuery)); + } + + /** + * Returns a query that checks for equality of at least one of the provided terms across * all binary doc values (but only for docs that also match a provided approximation query which * is key to getting good performance). */ @@ -63,7 +120,7 @@ public static Query fromTerms(Query approximation, String field, BytesRef... 
ter return new BinaryDvConfirmedTermsQuery(approximation, field, terms); } - protected abstract boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException; + protected abstract BinaryDVMatcher getBinaryDVMatcher(); protected abstract Query rewrite(Query approxRewrite) throws IOException; @@ -79,7 +136,7 @@ public Query rewrite(IndexSearcher searcher) throws IOException { @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { final Weight approxWeight = approxQuery.createWeight(searcher, scoreMode, boost); - + final BinaryDVMatcher matcher = getBinaryDVMatcher(); return new ConstantScoreWeight(this, boost) { @Override @@ -106,7 +163,7 @@ public boolean matches() throws IOException { } final BytesRef bytesRef = values.binaryValue(); bytes.reset(bytesRef.bytes, bytesRef.offset, bytesRef.length); - return matchesBinaryDV(bytes, bytesRef, scratch); + return matcher.matchesBinaryDV(bytes, bytesRef, scratch); } @Override @@ -157,42 +214,43 @@ public void visit(QueryVisitor visitor) { } } - private static class BinaryDvConfirmedAutomatonQuery extends BinaryDvConfirmedQuery { + interface BinaryDVMatcher { + boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException; + } - private final ByteRunAutomaton byteRunAutomaton; - private final String matchPattern; + private static class BinaryDvConfirmedAutomatonQuery extends BinaryDvConfirmedQuery { - private BinaryDvConfirmedAutomatonQuery(Query approximation, String field, String matchPattern, Automaton automaton) { - this(approximation, field, matchPattern, new ByteRunAutomaton(automaton)); - } + private final AutomatonProvider automatonProvider; - private BinaryDvConfirmedAutomatonQuery(Query approximation, String field, String matchPattern, ByteRunAutomaton byteRunAutomaton) { + private BinaryDvConfirmedAutomatonQuery(Query approximation, String field, AutomatonProvider 
automatonProvider) { super(approximation, field); - this.matchPattern = matchPattern; - this.byteRunAutomaton = byteRunAutomaton; + this.automatonProvider = automatonProvider; } @Override - protected boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException { - int size = bytes.readVInt(); - for (int i = 0; i < size; i++) { - int valLength = bytes.readVInt(); - if (byteRunAutomaton.run(bytesRef.bytes, bytes.getPosition(), valLength)) { - return true; + protected BinaryDVMatcher getBinaryDVMatcher() { + final ByteRunAutomaton byteRunAutomaton = new ByteRunAutomaton(automatonProvider.getAutomaton(field)); + return (bytes, bytesRef, scratch) -> { + final int size = bytes.readVInt(); + for (int i = 0; i < size; i++) { + final int valLength = bytes.readVInt(); + if (byteRunAutomaton.run(bytesRef.bytes, bytes.getPosition(), valLength)) { + return true; + } + bytes.skipBytes(valLength); } - bytes.skipBytes(valLength); - } - return false; + return false; + }; } @Override protected Query rewrite(Query approxRewrite) { - return new BinaryDvConfirmedAutomatonQuery(approxRewrite, field, matchPattern, byteRunAutomaton); + return new BinaryDvConfirmedAutomatonQuery(approxRewrite, field, automatonProvider); } @Override public String toString(String field) { - return field + ":" + matchPattern; + return field + ":" + automatonProvider.toString(); } @Override @@ -200,12 +258,12 @@ public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; if (super.equals(o) == false) return false; BinaryDvConfirmedAutomatonQuery other = (BinaryDvConfirmedAutomatonQuery) o; - return Objects.equals(byteRunAutomaton, other.byteRunAutomaton) && Objects.equals(matchPattern, other.matchPattern); + return Objects.equals(automatonProvider, other.automatonProvider); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), matchPattern, byteRunAutomaton); + return Objects.hash(super.hashCode(), 
automatonProvider); } } @@ -220,28 +278,31 @@ private BinaryDvConfirmedTermsQuery(Query approximation, String field, BytesRef[ } @Override - protected boolean matchesBinaryDV(ByteArrayStreamInput bytes, BytesRef bytesRef, BytesRef scratch) throws IOException { - scratch.bytes = bytesRef.bytes; - final int size = bytes.readVInt(); - for (int i = 0; i < size; i++) { - final int valLength = bytes.readVInt(); - scratch.offset = bytes.getPosition(); - scratch.length = valLength; - if (terms.length == 1) { - if (terms[0].bytesEquals(scratch)) { - return true; - } - } else { - final int pos = Arrays.binarySearch(terms, scratch, BytesRef::compareTo); - if (pos >= 0) { - assert terms[pos].bytesEquals(scratch) : "Expected term at position " + pos + " to match scratch, but it did not."; - return true; + protected BinaryDVMatcher getBinaryDVMatcher() { + return (bytes, bytesRef, scratch) -> { + scratch.bytes = bytesRef.bytes; + final int size = bytes.readVInt(); + for (int i = 0; i < size; i++) { + final int valLength = bytes.readVInt(); + scratch.offset = bytes.getPosition(); + scratch.length = valLength; + if (terms.length == 1) { + if (terms[0].bytesEquals(scratch)) { + return true; + } + } else { + final int pos = Arrays.binarySearch(terms, scratch, BytesRef::compareTo); + if (pos >= 0) { + assert terms[pos].bytesEquals(scratch) + : "Expected term at position " + pos + " to match scratch, but it did not."; + return true; + } } + bytes.skipBytes(valLength); } - bytes.skipBytes(valLength); - } - assert bytes.available() == 0 : "Expected no bytes left to read, but found " + bytes.available(); - return false; + assert bytes.available() == 0 : "Expected no bytes left to read, but found " + bytes.available(); + return false; + }; } @Override @@ -275,4 +336,43 @@ public int hashCode() { return Objects.hash(super.hashCode(), Arrays.hashCode(terms)); } } + + private interface AutomatonProvider { + Automaton getAutomaton(String field); + } + + private record 
PatternAutomatonProvider(String matchPattern, boolean caseInsensitive) implements AutomatonProvider { + @Override + public Automaton getAutomaton(String field) { + return caseInsensitive + ? AutomatonQueries.toCaseInsensitiveWildcardAutomaton(new Term(field, matchPattern)) + : WildcardQuery.toAutomaton(new Term(field, matchPattern), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + } + } + + private record RegexAutomatonProvider(String value, int syntaxFlags, int matchFlags, int maxDeterminizedStates) + implements + AutomatonProvider { + @Override + public Automaton getAutomaton(String field) { + RegExp regex = new RegExp(value, syntaxFlags, matchFlags); + return Operations.determinize(regex.toAutomaton(), maxDeterminizedStates); + } + } + + private record RangeAutomatonProvider(BytesRef lower, BytesRef upper, boolean includeLower, boolean includeUpper) + implements + AutomatonProvider { + @Override + public Automaton getAutomaton(String field) { + return TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper); + } + } + + private record FuzzyQueryAutomatonProvider(String searchTerm, FuzzyQuery fuzzyQuery) implements AutomatonProvider { + @Override + public Automaton getAutomaton(String field) { + return fuzzyQuery.getAutomata().automaton; + } + } } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 636f856e78f89..4a289a6d97bfb 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -36,7 +36,6 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.WildcardQuery; 
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; @@ -47,7 +46,6 @@ import org.elasticsearch.common.io.stream.ByteArrayStreamInput; import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.lucene.Lucene; -import org.elasticsearch.common.lucene.search.AutomatonQueries; import org.elasticsearch.common.time.DateMathParser; import org.elasticsearch.common.unit.Fuzziness; import org.elasticsearch.core.Nullable; @@ -314,15 +312,12 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, boolean // We have no concrete characters and we're not a pure length query e.g. ??? return new FieldExistsQuery(name()); } - Automaton automaton = caseInsensitive - ? AutomatonQueries.toCaseInsensitiveWildcardAutomaton(new Term(name(), wildcardPattern)) - : WildcardQuery.toAutomaton(new Term(name(), wildcardPattern), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); if (numClauses > 0) { // We can accelerate execution with the ngram query BooleanQuery approxQuery = rewritten.build(); - return BinaryDvConfirmedQuery.fromAutomaton(approxQuery, name(), wildcardPattern, automaton); + return BinaryDvConfirmedQuery.fromWildcardQuery(approxQuery, name(), wildcardPattern, caseInsensitive); } else { - return BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), name(), wildcardPattern, automaton); + return BinaryDvConfirmedQuery.fromWildcardQuery(new MatchAllDocsQuery(), name(), wildcardPattern, caseInsensitive); } } @@ -417,11 +412,8 @@ public Query regexpQuery( Query approxBooleanQuery = toApproximationQuery(ngramRegex); Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery); - RegExp regex = new RegExp(value, syntaxFlags, matchFlags); - Automaton automaton = Operations.determinize(regex.toAutomaton(), maxDeterminizedStates); - // We can accelerate execution with the ngram query - return BinaryDvConfirmedQuery.fromAutomaton(approxNgramQuery, name(), value, automaton); + return 
BinaryDvConfirmedQuery.fromRegexpQuery(approxNgramQuery, name(), value, syntaxFlags, matchFlags, maxDeterminizedStates); } // Convert a regular expression to a simplified query consisting of BooleanQuery and TermQuery objects @@ -750,12 +742,11 @@ public Query rangeQuery( } } } - Automaton automaton = TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper); if (accelerationQuery == null) { - return BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), name(), lower + "-" + upper, automaton); + return BinaryDvConfirmedQuery.fromRangeQuery(new MatchAllDocsQuery(), name(), lower, upper, includeLower, includeUpper); } - return BinaryDvConfirmedQuery.fromAutomaton(accelerationQuery, name(), lower + "-" + upper, automaton); + return BinaryDvConfirmedQuery.fromRangeQuery(accelerationQuery, name(), lower, upper, includeLower, includeUpper); } @Override @@ -844,10 +835,9 @@ public Query fuzzyQuery( rewriteMethod ); if (ngramQ.clauses().size() == 0) { - return BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), name(), searchTerm, fq.getAutomata().automaton); + return BinaryDvConfirmedQuery.fromFuzzyQuery(new MatchAllDocsQuery(), name(), searchTerm, fq); } - - return BinaryDvConfirmedQuery.fromAutomaton(ngramQ, name(), searchTerm, fq.getAutomata().automaton); + return BinaryDvConfirmedQuery.fromFuzzyQuery(ngramQ, name(), searchTerm, fq); } catch (IOException ioe) { throw new ElasticsearchParseException("Error parsing wildcard field fuzzy string [" + searchTerm + "]"); } diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java index 31f228ae6bd7e..be216c55d49cf 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java +++ 
b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -44,7 +44,6 @@ import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.cluster.metadata.IndexMetadata; -import org.elasticsearch.common.lucene.search.AutomatonQueries; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.Fuzziness; import org.elasticsearch.core.Tuple; @@ -678,19 +677,13 @@ public void testWildcardAcceleration() throws IOException, ParseException { public void testQueryCachingEqualityFromAutomaton() { String pattern = "A*b*B?a"; // Case sensitivity matters when it comes to caching - Automaton caseSensitiveAutomaton = WildcardQuery.toAutomaton(new Term("field", pattern), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); - Automaton caseInSensitiveAutomaton = AutomatonQueries.toCaseInsensitiveWildcardAutomaton(new Term("field", pattern)); - Query csQ = BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), "field", pattern, caseSensitiveAutomaton); - Query ciQ = BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), "field", pattern, caseInSensitiveAutomaton); + Query csQ = BinaryDvConfirmedQuery.fromWildcardQuery(new MatchAllDocsQuery(), "field", pattern, false); + Query ciQ = BinaryDvConfirmedQuery.fromWildcardQuery(new MatchAllDocsQuery(), "field", pattern, true); assertNotEquals(csQ, ciQ); assertNotEquals(csQ.hashCode(), ciQ.hashCode()); // Same query should be equal - Automaton caseSensitiveAutomaton2 = WildcardQuery.toAutomaton( - new Term("field", pattern), - Operations.DEFAULT_DETERMINIZE_WORK_LIMIT - ); - Query csQ2 = BinaryDvConfirmedQuery.fromAutomaton(new MatchAllDocsQuery(), "field", pattern, caseSensitiveAutomaton2); + Query csQ2 = BinaryDvConfirmedQuery.fromWildcardQuery(new MatchAllDocsQuery(), "field", pattern, false); assertEquals(csQ, csQ2); assertEquals(csQ.hashCode(), csQ2.hashCode()); } From 
9f12bf692340115bf1ad767d2dc738e54d104a0e Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Tue, 7 Oct 2025 15:51:44 +0200 Subject: [PATCH 2/2] Update docs/changelog/136086.yaml --- docs/changelog/136086.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/changelog/136086.yaml diff --git a/docs/changelog/136086.yaml b/docs/changelog/136086.yaml new file mode 100644 index 0000000000000..89d8b559bca99 --- /dev/null +++ b/docs/changelog/136086.yaml @@ -0,0 +1,6 @@ +pr: 136086 +summary: Delay automaton creation in `BinaryDvConfirmedQuery` to avoid OOM on queries + against `wildcard` fields +area: Search +type: bug +issues: []