Skip to content

Commit e5dc173

Browse files
authored
Apply default k for knn query eagerly (elastic#118774) (elastic#119700)
When originally added, the knn query didn't apply a top-`k` restriction to its results. Instead, it allowed all `num_candidates` results to be combined with sibling queries without first restricting them to the top `size` results. This is confusing behavior and leads to bugs and misunderstandings about how it all works. This commit addresses this by eagerly gathering only `size` results when `k == null`, before combining with other queries. The previous behavior can still be achieved by explicitly setting `k` equal to `num_candidates` in the query. (cherry picked from commit c18b48d)
1 parent 40b1e3e commit e5dc173

File tree

19 files changed

+89
-83
lines changed

19 files changed

+89
-83
lines changed

docs/changelog/118774.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 118774
2+
summary: Apply default k for knn query eagerly
3+
area: Vector Search
4+
type: bug
5+
issues: []

docs/reference/query-dsl/knn-query.asciidoc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ include::{es-ref-dir}/rest-api/common-parms.asciidoc[tag=knn-query-vector-builde
100100
--
101101
(Optional, integer) The number of nearest neighbors to return from each shard.
102102
{es} collects `k` results from each shard, then merges them to find the global top results.
103-
This value must be less than or equal to `num_candidates`. Defaults to `num_candidates`.
103+
This value must be less than or equal to `num_candidates`. Defaults to the search request `size`.
104104
--
105105

106106
`num_candidates`::

rest-api-spec/build.gradle

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,4 +259,20 @@ tasks.named("yamlRestTestV7CompatTransform").configure({ task ->
259259
task.skipTest("search.vectors/42_knn_search_bbq_flat/Test knn search", "Scoring has changed in latest versions")
260260
task.skipTest("synonyms/90_synonyms_reloading_for_synset/Reload analyzers for specific synonym set", "Can't work until auto-expand replicas is 0-1 for synonyms index")
261261
task.skipTest("search/90_search_after/_shard_doc sort", "restriction has been lifted in latest versions")
262+
task.skipTest("search.vectors/180_update_dense_vector_type/Test create and update dense vector mapping with bulk indexing", "waiting for #118774 backport")
263+
task.skipTest("search.vectors/160_knn_query_missing_params/kNN query in a bool clause - missing num_candidates", "waiting for #118774 backport")
264+
task.skipTest("search.vectors/110_knn_query_with_filter/Simple knn query", "waiting for #118774 backport")
265+
task.skipTest("search.vectors/160_knn_query_missing_params/kNN search used in nested field - missing num_candidates", "waiting for #118774 backport")
266+
task.skipTest("search.vectors/180_update_dense_vector_type/Test create and update dense vector mapping to int4 with per-doc indexing and flush", "waiting for #118774 backport")
267+
task.skipTest("search.vectors/110_knn_query_with_filter/PRE_FILTER: knn query with internal filter as pre-filter", "waiting for #118774 backport")
268+
task.skipTest("search.vectors/180_update_dense_vector_type/Index, update and merge", "waiting for #118774 backport")
269+
task.skipTest("search.vectors/160_knn_query_missing_params/kNN query with missing num_candidates param - size provided", "waiting for #118774 backport")
270+
task.skipTest("search.vectors/110_knn_query_with_filter/POST_FILTER: knn query with filter from a parent bool query as post-filter", "waiting for #118774 backport")
271+
task.skipTest("search.vectors/120_knn_query_multiple_shards/Aggregations with collected number of docs depends on num_candidates", "waiting for #118774 backport")
272+
task.skipTest("search.vectors/180_update_dense_vector_type/Test create and update dense vector mapping with per-doc indexing and flush", "waiting for #118774 backport")
273+
task.skipTest("search.vectors/110_knn_query_with_filter/PRE_FILTER: knn query with alias filter as pre-filter", "waiting for #118774 backport")
274+
task.skipTest("search.vectors/140_knn_query_with_other_queries/Function score query with knn query", "waiting for #118774 backport")
275+
task.skipTest("search.vectors/130_knn_query_nested_search/nested kNN search inner_hits size > 1", "waiting for #118774 backport")
276+
task.skipTest("search.vectors/110_knn_query_with_filter/PRE_FILTER: pre-filter across multiple aliases", "waiting for #118774 backport")
277+
task.skipTest("search.vectors/160_knn_query_missing_params/kNN search in a dis_max query - missing num_candidates", "waiting for #118774 backport")
262278
})

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/110_knn_query_with_filter.yml

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,9 @@ setup:
5959

6060
---
6161
"Simple knn query":
62-
62+
- requires:
63+
cluster_features: "search.vectors.k_param_supported"
64+
reason: 'k param for knn as query is required'
6365
- do:
6466
search:
6567
index: my_index
@@ -71,8 +73,9 @@ setup:
7173
field: my_vector
7274
query_vector: [1, 1, 1, 1]
7375
num_candidates: 5
76+
k: 5
7477

75-
- match: { hits.total.value: 5 } # collector sees num_candidates docs
78+
- match: { hits.total.value: 5 }
7679
- length: {hits.hits: 3}
7780
- match: { hits.hits.0._id: "1" }
7881
- match: { hits.hits.0.fields.my_name.0: v1 }
@@ -93,8 +96,9 @@ setup:
9396
field: my_vector
9497
query_vector: [1, 1, 1, 1]
9598
num_candidates: 5
99+
k: 5
96100

97-
- match: { hits.total.value: 5 } # collector sees num_candidates docs
101+
- match: { hits.total.value: 5 }
98102
- length: {hits.hits: 3}
99103
- match: { hits.hits.0._id: "2" }
100104
- match: { hits.hits.0.fields.my_name.0: v2 }
@@ -140,6 +144,7 @@ setup:
140144
field: my_vector
141145
query_vector: [1, 1, 1, 1]
142146
num_candidates: 5
147+
k: 5
143148

144149
- match: { hits.total.value: 5 }
145150
- length: { hits.hits: 3 }
@@ -184,6 +189,7 @@ setup:
184189
field: my_vector
185190
query_vector: [1, 1, 1, 1]
186191
num_candidates: 100
192+
k: 100
187193

188194
- match: { hits.total.value: 10 } # 5 docs from each alias
189195
- length: {hits.hits: 6}
@@ -213,6 +219,7 @@ setup:
213219
field: my_vector
214220
query_vector: [1, 1, 1, 1]
215221
num_candidates: 5
222+
k: 5
216223
filter:
217224
term:
218225
my_name: v2
@@ -243,9 +250,10 @@ setup:
243250
field: my_vector
244251
query_vector: [1, 1, 1, 1]
245252
num_candidates: 5
253+
k: 5
246254

247255
- match: { hits.total.value: 2 }
248-
- length: {hits.hits: 2} # knn query returns top 5 docs, but they are post-filtered to 2 docs
256+
- length: {hits.hits: 2} # knn query returns top 3 docs, but they are post-filtered to 2 docs
249257
- match: { hits.hits.0._id: "2" }
250258
- match: { hits.hits.0.fields.my_name.0: v2 }
251259
- match: { hits.hits.1._id: "4" }
@@ -271,4 +279,4 @@ setup:
271279
my_name: v1
272280

273281
- match: { hits.total.value: 0}
274-
- length: { hits.hits: 0 } # knn query returns top 5 docs, but they are post-filtered to 0 docs
282+
- length: { hits.hits: 0 } # knn query returns top 3 docs, but they are post-filtered to 0 docs

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/120_knn_query_multiple_shards.yml

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -166,55 +166,3 @@ setup:
166166
- close_to: { hits.hits.2._score: { value: 120, error: 0.00001 } }
167167
- close_to: { hits.hits.2.matched_queries.bm25_query: { value: 100.0, error: 0.00001 } }
168168
- close_to: { hits.hits.2.matched_queries.knn_query: { value: 20.0, error: 0.00001 } }
169-
170-
---
171-
"Aggregations with collected number of docs depends on num_candidates":
172-
- do:
173-
search:
174-
index: my_index
175-
body:
176-
size: 2
177-
query:
178-
knn:
179-
field: my_vector
180-
query_vector: [1, 1, 1, 1]
181-
num_candidates: 100 # collect up to 100 candidates from each shard
182-
aggs:
183-
my_agg:
184-
terms:
185-
field: my_name
186-
order:
187-
_key: asc
188-
189-
- length: {hits.hits: 2}
190-
- match: {hits.total.value: 12}
191-
- match: {aggregations.my_agg.buckets.0.key: 'v1'}
192-
- match: {aggregations.my_agg.buckets.1.key: 'v2'}
193-
- match: {aggregations.my_agg.buckets.0.doc_count: 6}
194-
- match: {aggregations.my_agg.buckets.1.doc_count: 6}
195-
196-
- do:
197-
search:
198-
index: my_index
199-
body:
200-
size: 2
201-
query:
202-
knn:
203-
field: my_vector
204-
query_vector: [ 1, 1, 1, 1 ]
205-
num_candidates: 3 # collect 3 candidates from each shard
206-
aggs:
207-
my_agg2:
208-
terms:
209-
field: my_name
210-
order:
211-
_key: asc
212-
my_sum_buckets:
213-
sum_bucket:
214-
buckets_path: "my_agg2>_count"
215-
216-
- length: { hits.hits: 2 }
217-
- match: { hits.total.value: 6 }
218-
- match: { aggregations.my_agg2.buckets.0.key: 'v1' }
219-
- match: { aggregations.my_agg2.buckets.1.key: 'v2' }
220-
- match: { aggregations.my_sum_buckets.value: 6.0 }

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/130_knn_query_nested_search.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ setup:
273273
knn:
274274
field: nested.vector
275275
query_vector: [ -0.5, 90.0, -10, 14.8, -156.0 ]
276+
k: 5
276277
num_candidates: 5
277278
inner_hits: { size: 2, "fields": [ "nested.paragraph_id" ], _source: false }
278279

@@ -295,6 +296,7 @@ setup:
295296
knn:
296297
field: nested.vector
297298
query_vector: [ -0.5, 90.0, -10, 14.8, -156.0 ]
299+
k: 5
298300
num_candidates: 5
299301
inner_hits: { size: 2, "fields": [ "nested.paragraph_id" ], _source: false }
300302

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/140_knn_query_with_other_queries.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ setup:
6969
field: my_vector
7070
query_vector: [ 1, 1, 1, 1 ]
7171
num_candidates: 5
72+
k: 5
7273
functions:
7374
- filter: { match: { my_name: v1 } }
7475
weight: 10

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/160_knn_query_missing_params.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,9 @@ setup:
100100
knn:
101101
field: vector
102102
query_vector: [1, 1, 1]
103+
k: 2
103104
size: 1
104-
- match: { hits.total: 2 } # due to num_candidates defined as round(1.5 * size), so we only see 2 results
105+
- match: { hits.total: 2 } # k defaults to size
105106
- length: { hits.hits: 1 } # one result is only returned though
106107

107108
---
@@ -117,6 +118,7 @@ setup:
117118
field: vector
118119
query_vector: [-1, -1, -1]
119120
num_candidates: 1
121+
k: 1
120122
size: 10
121123

122124
- match: { hits.total: 1 }
@@ -137,9 +139,10 @@ setup:
137139
- knn:
138140
field: vector
139141
query_vector: [ 1, 1, 0]
142+
k: 1
140143
size: 1
141144

142-
- match: { hits.total: 2 } # due to num_candidates defined as round(1.5 * size), so we only see 2 results from cat:A
145+
- match: { hits.total: 1 }
143146
- length: { hits.hits: 1 }
144147

145148
---
@@ -154,6 +157,7 @@ setup:
154157
- knn:
155158
field: vector
156159
query_vector: [1, 1, 0]
160+
k: 2
157161
- match:
158162
category: B
159163
tie_breaker: 0.8
@@ -175,6 +179,7 @@ setup:
175179
knn:
176180
field: nested.vector
177181
query_vector: [ -0.5, 90.0, -10, 14.8, -156.0 ]
182+
k: 2
178183
inner_hits: { size: 1, "fields": [ "nested.paragraph_id" ], _source: false }
179184
size: 1
180185

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/180_update_dense_vector_type.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ setup:
109109
field: embedding
110110
query_vector: [1, 1, 1, 1]
111111
num_candidates: 10
112+
k: 10
112113

113114
- match: { hits.total.value: 10 }
114115
- length: {hits.hits: 3}
@@ -215,6 +216,7 @@ setup:
215216
field: embedding
216217
query_vector: [ 1, 1, 1, 1 ]
217218
num_candidates: 20
219+
k: 20
218220

219221
- match: { hits.total.value: 20 }
220222
- length: { hits.hits: 3 }
@@ -322,6 +324,7 @@ setup:
322324
field: embedding
323325
query_vector: [ 1, 1, 1, 1 ]
324326
num_candidates: 30
327+
k: 30
325328

326329
- match: { hits.total.value: 30 }
327330
- length: { hits.hits: 4 }
@@ -430,6 +433,7 @@ setup:
430433
field: embedding
431434
query_vector: [ 1, 1, 1, 1 ]
432435
num_candidates: 40
436+
k: 40
433437

434438
- match: { hits.total.value: 40 }
435439
- length: { hits.hits: 5 }
@@ -499,6 +503,7 @@ setup:
499503
field: embedding
500504
query_vector: [1, 1, 1, 1]
501505
num_candidates: 10
506+
k: 10
502507

503508
- match: { hits.total.value: 10 }
504509
- length: {hits.hits: 3}
@@ -559,6 +564,7 @@ setup:
559564
field: embedding
560565
query_vector: [ 1, 1, 1, 1 ]
561566
num_candidates: 20
567+
k: 20
562568

563569
- match: { hits.total.value: 20 }
564570
- length: { hits.hits: 3 }
@@ -620,6 +626,7 @@ setup:
620626
field: embedding
621627
query_vector: [ 1, 1, 1, 1 ]
622628
num_candidates: 30
629+
k: 30
623630

624631
- match: { hits.total.value: 30 }
625632
- length: { hits.hits: 4 }
@@ -682,6 +689,7 @@ setup:
682689
field: embedding
683690
query_vector: [ 1, 1, 1, 1 ]
684691
num_candidates: 40
692+
k: 40
685693

686694
- match: { hits.total.value: 40 }
687695
- length: { hits.hits: 5 }
@@ -751,6 +759,7 @@ setup:
751759
field: embedding
752760
query_vector: [ 1, 1, 1, 1 ]
753761
num_candidates: 10
762+
k: 10
754763

755764
- match: { hits.total.value: 10 }
756765
- length: { hits.hits: 3 }
@@ -791,6 +800,7 @@ setup:
791800
field: embedding
792801
query_vector: [ 1, 1, 1, 1 ]
793802
num_candidates: 10
803+
k: 10
794804

795805
- match: { hits.total.value: 10 }
796806
- length: { hits.hits: 3 }
@@ -833,6 +843,7 @@ setup:
833843
field: embedding
834844
query_vector: [ 1, 1, 1, 1 ]
835845
num_candidates: 20
846+
k: 20
836847

837848
- match: { hits.total.value: 20 }
838849
- length: { hits.hits: 3 }
@@ -869,6 +880,7 @@ setup:
869880
field: embedding
870881
query_vector: [ 1, 1, 1, 1 ]
871882
num_candidates: 20
883+
k: 20
872884

873885
- match: { hits.total.value: 20 }
874886
- length: { hits.hits: 3 }
@@ -911,6 +923,7 @@ setup:
911923
field: embedding
912924
query_vector: [ 1, 1, 1, 1 ]
913925
num_candidates: 30
926+
k: 30
914927

915928
- match: { hits.total.value: 30 }
916929
- length: { hits.hits: 4 }
@@ -933,6 +946,7 @@ setup:
933946
knn:
934947
field: embedding
935948
query_vector: [ 1, 1, 1, 1 ]
949+
k: 30
936950
num_candidates: 30
937951

938952
- match: { hits.total.value: 30 }
@@ -1769,6 +1783,7 @@ setup:
17691783
field: embedding
17701784
query_vector: [1, 1, 1, 1]
17711785
num_candidates: 10
1786+
k: 10
17721787

17731788
- match: { hits.total.value: 10 }
17741789
- length: {hits.hits: 3}
@@ -1875,6 +1890,7 @@ setup:
18751890
field: embedding
18761891
query_vector: [ 1, 1, 1, 1 ]
18771892
num_candidates: 20
1893+
k: 20
18781894

18791895
- match: { hits.total.value: 20 }
18801896
- length: { hits.hits: 3 }
@@ -1982,6 +1998,7 @@ setup:
19821998
field: embedding
19831999
query_vector: [ 1, 1, 1, 1 ]
19842000
num_candidates: 30
2001+
k: 30
19852002

19862003
- match: { hits.total.value: 30 }
19872004
- length: { hits.hits: 4 }
@@ -2090,6 +2107,7 @@ setup:
20902107
field: embedding
20912108
query_vector: [ 1, 1, 1, 1 ]
20922109
num_candidates: 40
2110+
k: 40
20932111

20942112
- match: { hits.total.value: 40 }
20952113
- length: { hits.hits: 5 }

0 commit comments

Comments
 (0)