Skip to content

Commit d8b6b9c

Browse files
Auto prefiltering for queries on dense semantic_text fields (elastic#138989)
`knn` queries allow specifying `filters` that will be applied before the knn search. This `pre-filtering` allows the `knn` to return `k` results. If such filters are applied only after the `knn` executes, then the `knn` returns the `k` matching results but the filters can filter out some of them, thus potentially returning fewer than `k` results. `semantic_text` fields can be queried with: DSL - `match` queries - `semantic` queries - `knn` queries; ES|QL - `match` queries - `knn` queries. For DSL, `knn` queries allow users to specify direct prefilters. However, `match` and `semantic` queries provide no way to do so. The same goes for ES|QL `match`. Note that ES|QL `KNN` already implements auto pre-filtering where conjunctions are pushed down to the `knn` query as prefilters. This commit implements semantic_text auto pre-filtering for `semantic_text` queries in DSL (`match` and `semantic` queries) and ES|QL (`MATCH`). We achieve this by adding an `AutoPrefilteringScope` object to the `SearchExecutionContext`. When we convert a `bool` query to a Lucene query, we push its `must`, `filter`, and `must_not` clauses to the `AutoPrefilteringScope`. At that stage queries have already been rewritten. Semantic queries using `text_embedding` inference endpoints are rewritten to knn vector queries that are auto-prefiltering enabled. Then, when an auto-prefiltering enabled knn vector query is converted to its Lucene equivalent, we fetch the prefilters from the `SearchExecutionContext` and we apply them to the knn vector query - which supports pre-filtering already. ES|QL queries that contain `MATCH` automatically benefit from this implementation because they are rewritten into `bool` queries. Limitations: DSL - nested queries are excluded from pre-filtering (elastic#138184); ES|QL - filters that are not translatable to Lucene queries will be applied as post-filters. Relates elastic#132068
1 parent 9e09c22 commit d8b6b9c

File tree

19 files changed

+2128
-24
lines changed

19 files changed

+2128
-24
lines changed

docs/changelog/138989.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 138989
2+
summary: Auto prefiltering for queries on dense `semantic_text` fields
3+
area: Vector Search
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/index/query/BoolQueryBuilder.java

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import org.elasticsearch.common.io.stream.StreamInput;
2020
import org.elasticsearch.common.io.stream.StreamOutput;
2121
import org.elasticsearch.common.lucene.search.Queries;
22+
import org.elasticsearch.index.query.support.AutoPrefilteringScope;
2223
import org.elasticsearch.xcontent.ObjectParser;
2324
import org.elasticsearch.xcontent.ParseField;
2425
import org.elasticsearch.xcontent.XContentBuilder;
@@ -318,18 +319,33 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException {
318319
return adjustPureNegative ? fixNegativeQueryIfNeeded(query) : query;
319320
}
320321

321-
private static void addBooleanClauses(
322+
private void addBooleanClauses(
322323
SearchExecutionContext context,
323324
BooleanQuery.Builder booleanQueryBuilder,
324325
List<QueryBuilder> clauses,
325326
Occur occurs
326327
) throws IOException {
327328
for (QueryBuilder query : clauses) {
328-
Query luceneQuery = query.toQuery(context);
329-
booleanQueryBuilder.add(new BooleanClause(luceneQuery, occurs));
329+
try (AutoPrefilteringScope autoPrefilteringScope = context.autoPrefilteringScope()) {
330+
autoPrefilteringScope.push(collectPrefilters(query));
331+
Query luceneQuery = query.toQuery(context);
332+
booleanQueryBuilder.add(new BooleanClause(luceneQuery, occurs));
333+
}
330334
}
331335
}
332336

337+
private List<QueryBuilder> collectPrefilters(QueryBuilder excluded) {
338+
List<QueryBuilder> prefilters = new ArrayList<>();
339+
mustClauses.stream().filter(q -> q != excluded).forEach(prefilters::add);
340+
filterClauses.stream().filter(q -> q != excluded).forEach(prefilters::add);
341+
mustNotClauses.stream().filter(q -> q != excluded).map(BoolQueryBuilder::invertQuery).forEach(prefilters::add);
342+
return prefilters;
343+
}
344+
345+
private static QueryBuilder invertQuery(QueryBuilder queryBuilder) {
346+
return QueryBuilders.boolQuery().mustNot(queryBuilder);
347+
}
348+
333349
@Override
334350
protected int doHashCode() {
335351
return Objects.hash(adjustPureNegative, minimumShouldMatch, mustClauses, shouldClauses, mustNotClauses, filterClauses);

server/src/main/java/org/elasticsearch/index/query/InterceptedQueryBuilderWrapper.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,19 @@
2222
* Wrapper for instances of {@link QueryBuilder} that have been intercepted using the {@link QueryRewriteInterceptor} to
2323
* break out of the rewrite phase. These instances are unwrapped on serialization.
2424
*/
25-
class InterceptedQueryBuilderWrapper implements QueryBuilder {
25+
public class InterceptedQueryBuilderWrapper implements QueryBuilder {
2626

2727
protected final QueryBuilder queryBuilder;
2828

29-
InterceptedQueryBuilderWrapper(QueryBuilder queryBuilder) {
29+
public InterceptedQueryBuilderWrapper(QueryBuilder queryBuilder) {
3030
super();
3131
this.queryBuilder = queryBuilder;
3232
}
3333

34+
/**
 * Returns the {@link QueryBuilder} this wrapper was constructed with.
 */
public QueryBuilder query() {
35+
return queryBuilder;
36+
}
37+
3438
@Override
3539
public QueryBuilder rewrite(QueryRewriteContext queryRewriteContext) throws IOException {
3640
QueryRewriteInterceptor queryRewriteInterceptor = queryRewriteContext.getQueryRewriteInterceptor();

server/src/main/java/org/elasticsearch/index/query/SearchExecutionContext.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import org.elasticsearch.index.mapper.ParsedDocument;
5050
import org.elasticsearch.index.mapper.SourceLoader;
5151
import org.elasticsearch.index.mapper.SourceToParse;
52+
import org.elasticsearch.index.query.support.AutoPrefilteringScope;
5253
import org.elasticsearch.index.query.support.NestedScope;
5354
import org.elasticsearch.index.similarity.SimilarityService;
5455
import org.elasticsearch.script.Script;
@@ -103,6 +104,7 @@ public class SearchExecutionContext extends QueryRewriteContext {
103104

104105
private final Map<String, Query> namedQueries = new HashMap<>();
105106
private NestedScope nestedScope;
107+
private AutoPrefilteringScope autoPrefilteringScope;
106108
private QueryBuilder aliasFilter;
107109
private boolean rewriteToNamedQueries = false;
108110

@@ -291,6 +293,7 @@ private SearchExecutionContext(
291293
this.bitsetFilterCache = bitsetFilterCache;
292294
this.indexFieldDataLookup = indexFieldDataLookup;
293295
this.nestedScope = new NestedScope();
296+
this.autoPrefilteringScope = new AutoPrefilteringScope();
294297
this.searcher = searcher;
295298
this.requestSize = requestSize;
296299
this.mapperMetrics = mapperMetrics;
@@ -301,7 +304,7 @@ private void reset() {
301304
this.lookup = null;
302305
this.namedQueries.clear();
303306
this.nestedScope = new NestedScope();
304-
307+
this.autoPrefilteringScope = new AutoPrefilteringScope();
305308
}
306309

307310
// Set alias filter, so it can be applied for queries that need it (e.g. knn query)
@@ -556,6 +559,10 @@ public NestedScope nestedScope() {
556559
return nestedScope;
557560
}
558561

562+
/**
 * Returns the {@link AutoPrefilteringScope} currently held by this context.
 */
public AutoPrefilteringScope autoPrefilteringScope() {
563+
return autoPrefilteringScope;
564+
}
565+
559566
/**
 * Returns the created-version of the index as reported by the index settings.
 */
public IndexVersion indexVersionCreated() {
560567
return indexSettings.getIndexVersionCreated();
561568
}
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.query.support;
11+
12+
import org.elasticsearch.core.Releasable;
13+
import org.elasticsearch.index.query.QueryBuilder;
14+
import org.elasticsearch.index.query.SearchExecutionContext;
15+
16+
import java.util.Deque;
17+
import java.util.LinkedList;
18+
import java.util.List;
19+
20+
/**
21+
* Keeps track of queries to be used as prefilters.
22+
* During {@link QueryBuilder#toQuery(SearchExecutionContext)}, each query pushes queries to be used
23+
* as prefilters to the {@link AutoPrefilteringScope}. Queries that need to apply prefilters can
24+
* fetch them by calling {@link #getPrefilters()}.
25+
*
26+
* The scope is implemented as a stack {@link Deque} of lists of prefilters.
27+
* As we move down the query tree, each query may push a list of prefilters.
28+
* A query that consumes prefilters fetches a flattened list of all prefilters in scope via {@link #getPrefilters()}.
29+
* When the query leaves the scope, {@link #pop()} should be called to remove the latest list of prefilters from the stack.
30+
* This way queries in other query tree branches will not fetch irrelevant prefilters.
31+
*/
32+
public final class AutoPrefilteringScope implements Releasable {
33+
34+
private final Deque<List<QueryBuilder>> prefiltersStack = new LinkedList<>();
35+
36+
/**
37+
* Pushes a list of prefilters to the scope.
38+
*/
39+
public void push(List<QueryBuilder> prefilters) {
40+
prefiltersStack.push(prefilters);
41+
}
42+
43+
/**
44+
* Removes the latest list of prefilters from the scope.
45+
*/
46+
public void pop() {
47+
prefiltersStack.pop();
48+
}
49+
50+
/**
51+
* Returns all prefilters in scope.
52+
*/
53+
public List<QueryBuilder> getPrefilters() {
54+
return prefiltersStack.stream().flatMap(List::stream).toList();
55+
}
56+
57+
@Override
58+
public void close() {
59+
pop();
60+
}
61+
}
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.query.support;
11+
12+
import org.elasticsearch.common.lucene.search.Queries;
13+
import org.elasticsearch.index.query.BoolQueryBuilder;
14+
import org.elasticsearch.index.query.BoostingQueryBuilder;
15+
import org.elasticsearch.index.query.ConstantScoreQueryBuilder;
16+
import org.elasticsearch.index.query.DisMaxQueryBuilder;
17+
import org.elasticsearch.index.query.InterceptedQueryBuilderWrapper;
18+
import org.elasticsearch.index.query.NestedQueryBuilder;
19+
import org.elasticsearch.index.query.QueryBuilder;
20+
import org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder;
21+
22+
import java.util.List;
23+
import java.util.Optional;
24+
import java.util.Set;
25+
26+
public final class AutoPrefilteringUtils {
27+
28+
private AutoPrefilteringUtils() {}
29+
30+
/**
31+
* Prunes any query branch on queries whose type is included in the prunedTypes set.
32+
* Note that queries may not preserve their scoring parameters as this assumes the returned
33+
* query is going to be used for filtering only.
34+
*
35+
* @param query the root query to start pruning from
36+
* @param prunedTypes the types of queries to prune
37+
* @return an {@link Optional} containing the pruned query, or {@link Optional#empty()} if the query was pruned to nothing.
38+
*/
39+
public static Optional<QueryBuilder> pruneQuery(QueryBuilder query, Set<Class<? extends QueryBuilder>> prunedTypes) {
40+
if (prunedTypes.contains(query.getClass())) {
41+
// Matched a pruned query type, prune away!
42+
return Optional.empty();
43+
}
44+
45+
return switch (query) {
46+
case BoolQueryBuilder boolQuery -> pruneBoolQuery(boolQuery, prunedTypes);
47+
// We only need the positive query here as the negative query is used for scoring only - we are filtering.
48+
case BoostingQueryBuilder boostingQuery -> pruneQuery(boostingQuery.positiveQuery(), prunedTypes);
49+
case ConstantScoreQueryBuilder constantScoreQuery -> {
50+
Optional<QueryBuilder> pruned = pruneQuery(constantScoreQuery.innerQuery(), prunedTypes);
51+
yield pruned.map(q -> q == constantScoreQuery.innerQuery() ? constantScoreQuery : new ConstantScoreQueryBuilder(q));
52+
}
53+
case DisMaxQueryBuilder disMaxQuery -> pruneDisMaxQuery(disMaxQuery, prunedTypes);
54+
case FunctionScoreQueryBuilder functionScoreQuery -> pruneFunctionScoreQuery(functionScoreQuery, prunedTypes);
55+
case InterceptedQueryBuilderWrapper interceptedQuery -> {
56+
Optional<QueryBuilder> pruned = pruneQuery(interceptedQuery.query(), prunedTypes);
57+
yield pruned.map(q -> q == interceptedQuery.query() ? interceptedQuery : new InterceptedQueryBuilderWrapper(q));
58+
}
59+
case NestedQueryBuilder nestedQuery -> {
60+
Optional<QueryBuilder> pruned = pruneQuery(nestedQuery.query(), prunedTypes);
61+
yield pruned.map(
62+
q -> q == nestedQuery.query() ? nestedQuery : new NestedQueryBuilder(nestedQuery.path(), q, nestedQuery.scoreMode())
63+
);
64+
}
65+
default -> Optional.of(query);
66+
};
67+
}
68+
69+
private static Optional<QueryBuilder> pruneBoolQuery(BoolQueryBuilder boolQuery, Set<Class<? extends QueryBuilder>> prunedTypes) {
70+
BoolQueryBuilder prunedBool = new BoolQueryBuilder();
71+
pruneQueries(boolQuery.must(), prunedTypes).forEach(prunedBool::must);
72+
pruneQueries(boolQuery.should(), prunedTypes).forEach(prunedBool::should);
73+
pruneQueries(boolQuery.filter(), prunedTypes).forEach(prunedBool::filter);
74+
pruneQueries(boolQuery.mustNot(), prunedTypes).forEach(prunedBool::mustNot);
75+
adjustMinimumShouldMatchForPrunedShouldClauses(boolQuery, prunedBool);
76+
77+
if (prunedBool.equals(boolQuery)) {
78+
return Optional.of(boolQuery);
79+
}
80+
81+
// No need to preserve scoring parameters for filtering.
82+
return prunedBool.hasClauses() ? Optional.of(prunedBool) : Optional.empty();
83+
}
84+
85+
private static void adjustMinimumShouldMatchForPrunedShouldClauses(BoolQueryBuilder originalBool, BoolQueryBuilder prunedBool) {
86+
if (originalBool.minimumShouldMatch() == null) {
87+
return;
88+
}
89+
if (prunedBool.should().size() == originalBool.should().size()) {
90+
prunedBool.minimumShouldMatch(originalBool.minimumShouldMatch());
91+
} else {
92+
int originalMsm = Queries.calculateMinShouldMatch(originalBool.should().size(), originalBool.minimumShouldMatch());
93+
int numPrunedClauses = originalBool.should().size() - prunedBool.should().size();
94+
// We need to adjust the minimum should match to account for the pruned clauses.
95+
// We considered the following approaches:
96+
// 1. strict approach: set to min(remaining_should_clauses, original_msm)
97+
// 2. lenient approach: if msm is set and at least one should clause is pruned, prune all should clauses.
98+
// 3. middle ground approach: set to max(0, original_msm - remaining_should_clauses)
99+
// Let us imagine a query with 5 should clauses. 2 get pruned. msm is 3. 1 remaining clause matches.
100+
// Approach 1 would make the entire bool query to not match as we would retain msm of 3 but only 1 clause would match.
101+
// We do not know whether the pruned clauses would match or not. Thus, this approach seems too restrictive.
102+
// Approach 2 would mean we prune all should clauses and the query would match,
103+
// even if none of the remaining should clauses match.
104+
// Approach 3 would mean we adjust the msm to 3 - 2 = 1. This would mean that the query would match if at least one
105+
// of the remaining clauses matches.
106+
// We opt for the lenient approach. It is as if we assume the pruned clauses matched. Seems to be the best compromise.
107+
int prunedMsm = Math.max(0, originalMsm - numPrunedClauses);
108+
prunedBool.minimumShouldMatch(prunedMsm);
109+
}
110+
}
111+
112+
private static Optional<QueryBuilder> pruneDisMaxQuery(DisMaxQueryBuilder disMaxQuery, Set<Class<? extends QueryBuilder>> prunedTypes) {
113+
DisMaxQueryBuilder builder = new DisMaxQueryBuilder();
114+
for (QueryBuilder innerQuery : disMaxQuery.innerQueries()) {
115+
Optional<QueryBuilder> pruned = pruneQuery(innerQuery, prunedTypes);
116+
pruned.ifPresent(builder::add);
117+
}
118+
if (builder.innerQueries().equals(disMaxQuery.innerQueries())) {
119+
return Optional.of(disMaxQuery);
120+
}
121+
// No need to preserve tiebreaks for filtering.
122+
return builder.innerQueries().isEmpty() ? Optional.empty() : Optional.of(builder);
123+
}
124+
125+
private static Optional<QueryBuilder> pruneFunctionScoreQuery(
126+
FunctionScoreQueryBuilder functionScoreQuery,
127+
Set<Class<? extends QueryBuilder>> prunedTypes
128+
) {
129+
// We could remove the function score entirely as it should not be helpful for filtering,
130+
// but leaving it in for now to preserve the original query.
131+
Optional<QueryBuilder> pruned = pruneQuery(functionScoreQuery.query(), prunedTypes);
132+
return pruned.map(
133+
q -> q == functionScoreQuery.query()
134+
? functionScoreQuery
135+
: new FunctionScoreQueryBuilder(q, functionScoreQuery.filterFunctionBuilders())
136+
);
137+
}
138+
139+
private static List<QueryBuilder> pruneQueries(List<QueryBuilder> queries, Set<Class<? extends QueryBuilder>> prunedTypes) {
140+
return queries.stream().map(q -> pruneQuery(q, prunedTypes)).filter(Optional::isPresent).map(Optional::get).toList();
141+
}
142+
}

0 commit comments

Comments
 (0)