Skip to content

Commit b5d5229

Browse files
authored
Add support for nested queries for ivf indices (#128782)
This is a first pass at adding nested query support for bbq_ivf indices. The support is fairly simple for now: we keep exploring clusters until we have collected at least `k` results. This covers the case where the nested docs are all tightly clustered and the typical `nprobe` explores too few clusters to actually yield `k` docs. I have some weird test failures that I still need to debug, so I am opening this as a draft for now.
1 parent 7ec8fcc commit b5d5229

File tree

13 files changed

+1008
-89
lines changed

13 files changed

+1008
-89
lines changed
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
setup:
2+
- requires:
3+
cluster_features: "mapper.ivf_nested_support"
4+
reason: 'ivf nested support required'
5+
- do:
6+
indices.create:
7+
index: test
8+
body:
9+
settings:
10+
index:
11+
number_of_shards: 1
12+
mappings:
13+
properties:
14+
name:
15+
type: keyword
16+
nested:
17+
type: nested
18+
properties:
19+
paragraph_id:
20+
type: keyword
21+
vector:
22+
type: dense_vector
23+
dims: 5
24+
index: true
25+
similarity: l2_norm
26+
index_options:
27+
type: bbq_ivf
28+
29+
aliases:
30+
my_alias:
31+
filter:
32+
term:
33+
name: "rabbit.jpg"
34+
35+
- do:
36+
index:
37+
index: test
38+
id: "1"
39+
body:
40+
name: cow.jpg
41+
nested:
42+
- paragraph_id: 0
43+
vector: [230, 300.33, -34.8988, 15.555, -200]
44+
- paragraph_id: 1
45+
vector: [240, 300, -3, 1, -20]
46+
47+
- do:
48+
index:
49+
index: test
50+
id: "2"
51+
body:
52+
name: moose.jpg
53+
nested:
54+
- paragraph_id: 0
55+
vector: [-0.5, 100, -13, 14.8, -156]
56+
- paragraph_id: 2
57+
vector: [0, 100, 0, 14.8, -156]
58+
59+
- do:
60+
index:
61+
index: test
62+
id: "3"
63+
body:
64+
name: rabbit.jpg
65+
nested:
66+
- paragraph_id: 0
67+
vector: [0.5, 111.3, -13, 14.8, -156]
68+
- do:
69+
indices.forcemerge:
70+
index: test
71+
max_num_segments: 1
72+
73+
- do:
74+
indices.refresh: {}
75+
76+
---
77+
"nested kNN search that returns diverse parents docs":
78+
- do:
79+
search:
80+
index: test
81+
body:
82+
fields: [ "name" ]
83+
query:
84+
nested:
85+
path: nested
86+
query:
87+
knn:
88+
field: nested.vector
89+
query_vector: [-0.5, 90, -10, 14.8, -156]
90+
num_candidates: 3
91+
- match: {hits.total.value: 3}
92+
93+
- match: {hits.hits.0._id: "2"}
94+
- match: {hits.hits.0.fields.name.0: "moose.jpg"}
95+
96+
- match: {hits.hits.1._id: "3"}
97+
- match: {hits.hits.1.fields.name.0: "rabbit.jpg"}
98+
99+
- do:
100+
search:
101+
index: test
102+
body:
103+
fields: [ "name" ]
104+
query:
105+
nested:
106+
path: nested
107+
query:
108+
knn:
109+
field: nested.vector
110+
query_vector: [ -0.5, 90, -10, 14.8, -156 ]
111+
num_candidates: 3
112+
inner_hits: { size: 1, "fields": [ "nested.paragraph_id" ], _source: false }
113+
114+
- match: {hits.total.value: 3}
115+
116+
- match: { hits.hits.0._id: "2" }
117+
- match: { hits.hits.0.fields.name.0: "moose.jpg" }
118+
- match: { hits.hits.0.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
119+
120+
- match: { hits.hits.1._id: "3" }
121+
- match: { hits.hits.1.fields.name.0: "rabbit.jpg" }
122+
- match: { hits.hits.1.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
123+
124+
- match: { hits.hits.2._id: "1" }
125+
- match: { hits.hits.2.fields.name.0: "cow.jpg" }
126+
- match: { hits.hits.2.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
127+
128+
---
129+
"nested kNN search pre-filtered on alias with filter on top level fields":
130+
- do:
131+
search:
132+
index: my_alias # filter on name: "rabbit.jpg"
133+
body:
134+
fields: [ "name" ]
135+
query:
136+
nested:
137+
path: nested
138+
query:
139+
knn:
140+
field: nested.vector
141+
query_vector: [ -0.5, 90.0, -10, 14.8, -156.0 ]
142+
num_candidates: 1
143+
inner_hits: { size: 1, "fields": [ "nested.paragraph_id" ], _source: false }
144+
145+
- match: {hits.total.value: 1} # as alias is passed as pre-filter, we get a single result
146+
- match: {hits.hits.0._id: "3"}
147+
- match: {hits.hits.0.fields.name.0: "rabbit.jpg"}
148+
- match: { hits.hits.0.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
149+
150+
---
151+
"nested kNN search post-filtered on top level fields":
152+
- do:
153+
search:
154+
index: test
155+
body:
156+
fields: [ "name" ]
157+
query:
158+
bool:
159+
must:
160+
- term:
161+
name: "rabbit.jpg"
162+
- nested:
163+
path: nested
164+
query:
165+
knn:
166+
field: nested.vector
167+
query_vector: [ -0.5, 90.0, -10, 14.8, -156.0 ]
168+
num_candidates: 1
169+
- match: { hits.total.value: 0 } # no hits because returned single vector did not pass post-filter
170+
171+
- do:
172+
search:
173+
index: test
174+
body:
175+
fields: [ "name" ]
176+
query:
177+
bool:
178+
must:
179+
- term:
180+
name: "rabbit.jpg"
181+
- nested:
182+
path: nested
183+
query:
184+
knn:
185+
field: nested.vector
186+
query_vector: [ -0.5, 90.0, -10, 14.8, -156.0 ]
187+
num_candidates: 3
188+
inner_hits: { size: 1, fields: [ "nested.paragraph_id" ], _source: false }
189+
190+
- match: {hits.total.value: 1}
191+
- match: {hits.hits.0._id: "3"}
192+
- match: {hits.hits.0.fields.name.0: "rabbit.jpg"}
193+
- match: { hits.hits.0.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/46_knn_search_bbq_ivf.yml

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -198,47 +198,6 @@ setup:
198198
index_options:
199199
type: bbq_ivf
200200
---
201-
"Test few dimensions fail indexing":
202-
- do:
203-
catch: bad_request
204-
indices.create:
205-
index: bad_bbq_ivf
206-
body:
207-
mappings:
208-
properties:
209-
vector:
210-
type: dense_vector
211-
dims: 42
212-
index: true
213-
index_options:
214-
type: bbq_ivf
215-
216-
- do:
217-
indices.create:
218-
index: dynamic_dim_bbq_ivf
219-
body:
220-
mappings:
221-
properties:
222-
vector:
223-
type: dense_vector
224-
index: true
225-
similarity: l2_norm
226-
index_options:
227-
type: bbq_ivf
228-
229-
- do:
230-
catch: bad_request
231-
index:
232-
index: dynamic_dim_bbq_ivf
233-
body:
234-
vector: [1.0, 2.0, 3.0, 4.0, 5.0]
235-
236-
- do:
237-
index:
238-
index: dynamic_dim_bbq_ivf
239-
body:
240-
vector: [1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0]
241-
---
242201
"Test index configured rescore vector":
243202
- skip:
244203
features: "headers"

server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsReader.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.lucene.index.VectorEncoding;
2323
import org.apache.lucene.index.VectorSimilarityFunction;
2424
import org.apache.lucene.internal.hppc.IntObjectHashMap;
25+
import org.apache.lucene.search.AbstractKnnCollector;
2526
import org.apache.lucene.search.KnnCollector;
2627
import org.apache.lucene.store.ChecksumIndexInput;
2728
import org.apache.lucene.store.DataInput;
@@ -252,7 +253,10 @@ public final void search(String field, float[] target, KnnCollector knnCollector
252253
}
253254
return visitedDocs.getAndSet(docId) == false;
254255
};
256+
assert knnCollector instanceof AbstractKnnCollector;
257+
AbstractKnnCollector knnCollectorImpl = (AbstractKnnCollector) knnCollector;
255258
int nProbe = DYNAMIC_NPROBE;
259+
// Search strategy may be null if this is being called from checkIndex (e.g. from a test)
256260
if (knnCollector.getSearchStrategy() instanceof IVFKnnSearchStrategy ivfSearchStrategy) {
257261
nProbe = ivfSearchStrategy.getNProbe();
258262
}
@@ -280,7 +284,10 @@ public final void search(String field, float[] target, KnnCollector knnCollector
280284
long expectedDocs = 0;
281285
long actualDocs = 0;
282286
// initially we visit only the "centroids to search"
283-
while (centroidQueue.size() > 0 && centroidsVisited < nProbe && actualDocs < knnCollector.k()) {
287+
// Note, numCollected is doing the bare minimum here.
288+
// TODO do we need to handle nested doc counts similarly to how we handle
289+
// filtering? E.g. keep exploring until we hit an expected number of parent documents vs. child vectors?
290+
while (centroidQueue.size() > 0 && (centroidsVisited < nProbe || knnCollectorImpl.numCollected() < knnCollector.k())) {
284291
++centroidsVisited;
285292
// todo do we actually need to know the score???
286293
int centroidOrdinal = centroidQueue.pop();

server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ public class MapperFeatures implements FeatureSpecification {
4242
);
4343
static final NodeFeature NPE_ON_DIMS_UPDATE_FIX = new NodeFeature("mapper.npe_on_dims_update_fix");
4444
static final NodeFeature IVF_FORMAT_CLUSTER_FEATURE = new NodeFeature("mapper.ivf_format_cluster_feature");
45+
static final NodeFeature IVF_NESTED_SUPPORT = new NodeFeature("mapper.ivf_nested_support");
4546

4647
@Override
4748
public Set<NodeFeature> getTestFeatures() {
@@ -70,7 +71,8 @@ public Set<NodeFeature> getTestFeatures() {
7071
NPE_ON_DIMS_UPDATE_FIX,
7172
RESCORE_ZERO_VECTOR_QUANTIZED_VECTOR_MAPPING,
7273
USE_DEFAULT_OVERSAMPLE_VALUE_FOR_BBQ,
73-
IVF_FORMAT_CLUSTER_FEATURE
74+
IVF_FORMAT_CLUSTER_FEATURE,
75+
IVF_NESTED_SUPPORT
7476
);
7577
}
7678
}

server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@
6464
import org.elasticsearch.index.mapper.Mapper;
6565
import org.elasticsearch.index.mapper.MapperBuilderContext;
6666
import org.elasticsearch.index.mapper.MapperParsingException;
67-
import org.elasticsearch.index.mapper.MappingLookup;
6867
import org.elasticsearch.index.mapper.MappingParser;
6968
import org.elasticsearch.index.mapper.NumberFieldMapper;
7069
import org.elasticsearch.index.mapper.SimpleMappedFieldType;
@@ -77,6 +76,7 @@
7776
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
7877
import org.elasticsearch.search.lookup.Source;
7978
import org.elasticsearch.search.vectors.DenseVectorQuery;
79+
import org.elasticsearch.search.vectors.DiversifyingChildrenIVFKnnFloatVectorQuery;
8080
import org.elasticsearch.search.vectors.ESDiversifyingChildrenByteKnnVectorQuery;
8181
import org.elasticsearch.search.vectors.ESDiversifyingChildrenFloatKnnVectorQuery;
8282
import org.elasticsearch.search.vectors.ESKnnByteVectorQuery;
@@ -1650,7 +1650,7 @@ public boolean supportsElementType(ElementType elementType) {
16501650

16511651
@Override
16521652
public boolean supportsDimension(int dims) {
1653-
return dims >= BBQ_MIN_DIMS;
1653+
return true;
16541654
}
16551655
};
16561656

@@ -2521,12 +2521,19 @@ && isNotUnitVector(squaredMagnitude)) {
25212521
adjustedK = Math.min((int) Math.ceil(k * oversample), OVERSAMPLE_LIMIT);
25222522
numCands = Math.max(adjustedK, numCands);
25232523
}
2524-
if (parentFilter != null && indexOptions instanceof BBQIVFIndexOptions) {
2525-
throw new IllegalArgumentException("IVF index does not support nested queries");
2526-
}
25272524
Query knnQuery;
25282525
if (indexOptions instanceof BBQIVFIndexOptions bbqIndexOptions) {
2529-
knnQuery = new IVFKnnFloatVectorQuery(name(), queryVector, adjustedK, numCands, filter, bbqIndexOptions.defaultNProbe);
2526+
knnQuery = parentFilter != null
2527+
? new DiversifyingChildrenIVFKnnFloatVectorQuery(
2528+
name(),
2529+
queryVector,
2530+
adjustedK,
2531+
numCands,
2532+
filter,
2533+
parentFilter,
2534+
bbqIndexOptions.defaultNProbe
2535+
)
2536+
: new IVFKnnFloatVectorQuery(name(), queryVector, adjustedK, numCands, filter, bbqIndexOptions.defaultNProbe);
25302537
} else {
25312538
knnQuery = parentFilter != null
25322539
? new ESDiversifyingChildrenFloatKnnVectorQuery(
@@ -2769,19 +2776,6 @@ public FieldMapper.Builder getMergeBuilder() {
27692776
return new Builder(leafName(), indexCreatedVersion).init(this);
27702777
}
27712778

2772-
@Override
2773-
public void doValidate(MappingLookup mappers) {
2774-
if (indexOptions instanceof BBQIVFIndexOptions && mappers.nestedLookup().getNestedParent(fullPath()) != null) {
2775-
throw new IllegalArgumentException(
2776-
"["
2777-
+ CONTENT_TYPE
2778-
+ "] fields with index type ["
2779-
+ indexOptions.type
2780-
+ "] cannot be indexed if they're within [nested] mappings"
2781-
);
2782-
}
2783-
}
2784-
27852779
private static IndexOptions parseIndexOptions(String fieldName, Object propNode, IndexVersion indexVersion) {
27862780
@SuppressWarnings("unchecked")
27872781
Map<String, ?> indexOptionsMap = (Map<String, ?>) propNode;

server/src/main/java/org/elasticsearch/search/vectors/AbstractIVFKnnVectorQuery.java

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ abstract TopDocs approximateSearch(
169169
) throws IOException;
170170

171171
protected KnnCollectorManager getKnnCollectorManager(int k, IndexSearcher searcher) {
172-
return new IVFCollectorManager(k, nProbe);
172+
return new IVFCollectorManager(k);
173173
}
174174

175175
@Override
@@ -195,16 +195,14 @@ protected boolean match(int doc) {
195195

196196
static class IVFCollectorManager implements KnnCollectorManager {
197197
private final int k;
198-
private final int nprobe;
199198

200-
IVFCollectorManager(int k, int nprobe) {
199+
IVFCollectorManager(int k) {
201200
this.k = k;
202-
this.nprobe = nprobe;
203201
}
204202

205203
@Override
206204
public KnnCollector newCollector(int visitedLimit, KnnSearchStrategy searchStrategy, LeafReaderContext context) throws IOException {
207-
return new TopKnnCollector(k, visitedLimit, new IVFKnnSearchStrategy(nprobe));
205+
return new TopKnnCollector(k, visitedLimit, searchStrategy);
208206
}
209207
}
210208
}

0 commit comments

Comments
 (0)