Skip to content

Commit ad3903d

Browse files
authored
Adds maxSim functions for multi_dense_vector fields (elastic#116993) (elastic#117203)
This adds `maxSim` functions, specifically dotProduct and InvHamming. Why these two, you might ask? Well, they are the best approximations of what's possible with Col* late-interaction type models. Effectively, you want a similarity metric where "greater == better". Regular `hamming` isn't exactly that, but inverting it (just like our `element_type: bit` index for dense_vectors) is a nice approximation with bit vectors and multi-vector scoring. Then, of course, dotProduct is another usage. We will allow dot-product between like elements (bytes -> bytes, floats -> floats) and, of course, allow `floats -> bit`, where the stored `bit` elements are applied as a "mask" over the float queries. This allows for some nice asymmetric interactions. This is all behind a feature flag, and I need to write a mountain of docs in a separate PR.
1 parent 4640fd9 commit ad3903d

File tree

13 files changed

+1274
-16
lines changed

13 files changed

+1274
-16
lines changed

modules/lang-painless/src/main/resources/org/elasticsearch/painless/org.elasticsearch.script.score.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,7 @@ static_import {
5050
double cosineSimilarity(org.elasticsearch.script.ScoreScript, Object, String) bound_to org.elasticsearch.script.VectorScoreScriptUtils$CosineSimilarity
5151
double dotProduct(org.elasticsearch.script.ScoreScript, Object, String) bound_to org.elasticsearch.script.VectorScoreScriptUtils$DotProduct
5252
double hamming(org.elasticsearch.script.ScoreScript, Object, String) bound_to org.elasticsearch.script.VectorScoreScriptUtils$Hamming
53+
double maxSimDotProduct(org.elasticsearch.script.ScoreScript, Object, String) bound_to org.elasticsearch.script.MultiVectorScoreScriptUtils$MaxSimDotProduct
54+
double maxSimInvHamming(org.elasticsearch.script.ScoreScript, Object, String) bound_to org.elasticsearch.script.MultiVectorScoreScriptUtils$MaxSimInvHamming
5355
}
5456

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
setup:
2+
- requires:
3+
capabilities:
4+
- method: POST
5+
path: /_search
6+
capabilities: [ multi_dense_vector_script_max_sim ]
7+
test_runner_features: capabilities
8+
reason: "Support for multi dense vector max-sim functions capability required"
9+
- skip:
10+
features: headers
11+
12+
- do:
13+
indices.create:
14+
index: test-index
15+
body:
16+
settings:
17+
number_of_shards: 1
18+
mappings:
19+
properties:
20+
vector:
21+
type: multi_dense_vector
22+
dims: 5
23+
byte_vector:
24+
type: multi_dense_vector
25+
dims: 5
26+
element_type: byte
27+
bit_vector:
28+
type: multi_dense_vector
29+
dims: 40
30+
element_type: bit
31+
- do:
32+
index:
33+
index: test-index
34+
id: "1"
35+
body:
36+
vector: [[230.0, 300.33, -34.8988, 15.555, -200.0], [-0.5, 100.0, -13, 14.8, -156.0]]
37+
byte_vector: [[8, 5, -15, 1, -7], [-1, 115, -3, 4, -128]]
38+
bit_vector: [[8, 5, -15, 1, -7], [-1, 115, -3, 4, -128]]
39+
40+
- do:
41+
index:
42+
index: test-index
43+
id: "3"
44+
body:
45+
vector: [[0.5, 111.3, -13.0, 14.8, -156.0]]
46+
byte_vector: [[2, 18, -5, 0, -124]]
47+
bit_vector: [[2, 18, -5, 0, -124]]
48+
49+
- do:
50+
indices.refresh: {}
51+
---
52+
"Test max-sim dot product scoring":
53+
- skip:
54+
features: close_to
55+
56+
- do:
57+
headers:
58+
Content-Type: application/json
59+
search:
60+
rest_total_hits_as_int: true
61+
body:
62+
query:
63+
script_score:
64+
query: {match_all: {} }
65+
script:
66+
source: "maxSimDotProduct(params.query_vector, 'vector')"
67+
params:
68+
query_vector: [[1, 2, 1, 1, 1]]
69+
70+
- match: {hits.total: 2}
71+
72+
- match: {hits.hits.0._id: "1"}
73+
- close_to: {hits.hits.0._score: {value: 611.316, error: 0.01}}
74+
75+
- match: {hits.hits.1._id: "3"}
76+
- close_to: {hits.hits.1._score: {value: 68.90001, error: 0.01}}
77+
78+
- do:
79+
headers:
80+
Content-Type: application/json
81+
search:
82+
rest_total_hits_as_int: true
83+
body:
84+
query:
85+
script_score:
86+
query: {match_all: {} }
87+
script:
88+
source: "maxSimDotProduct(params.query_vector, 'byte_vector')"
89+
params:
90+
query_vector: [[1, 2, 1, 1, 0]]
91+
92+
- match: {hits.total: 2}
93+
94+
- match: {hits.hits.0._id: "1"}
95+
- close_to: {hits.hits.0._score: {value: 230, error: 0.01}}
96+
97+
- match: {hits.hits.1._id: "3"}
98+
- close_to: {hits.hits.1._score: {value: 33, error: 0.01}}
99+
100+
- do:
101+
headers:
102+
Content-Type: application/json
103+
search:
104+
rest_total_hits_as_int: true
105+
body:
106+
query:
107+
script_score:
108+
query: {match_all: {} }
109+
script:
110+
source: "maxSimDotProduct(params.query_vector, 'bit_vector')"
111+
params:
112+
query_vector: [[1, 2, 1, 1, 0]]
113+
114+
- match: {hits.total: 2}
115+
116+
- match: {hits.hits.0._id: "1"}
117+
- close_to: {hits.hits.0._score: {value: 3, error: 0.01}}
118+
119+
- match: {hits.hits.1._id: "3"}
120+
- close_to: {hits.hits.1._score: {value: 2, error: 0.01}}
121+
122+
# doing max-sim dot product with a vector where the stored bit vectors are used as masks
123+
- do:
124+
headers:
125+
Content-Type: application/json
126+
search:
127+
rest_total_hits_as_int: true
128+
body:
129+
query:
130+
script_score:
131+
query: {match_all: {} }
132+
script:
133+
source: "maxSimDotProduct(params.query_vector, 'bit_vector')"
134+
params:
135+
query_vector: [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]]
136+
- match: {hits.total: 2}
137+
138+
- match: {hits.hits.0._id: "1"}
139+
- close_to: {hits.hits.0._score: {value: 190, error: 0.01}}
140+
141+
- match: {hits.hits.1._id: "3"}
142+
- close_to: {hits.hits.1._score: {value: 125, error: 0.01}}
143+
---
144+
"Test max-sim inv hamming scoring":
145+
- skip:
146+
features: close_to
147+
148+
# inv hamming doesn't apply to float vectors
149+
- do:
150+
catch: bad_request
151+
headers:
152+
Content-Type: application/json
153+
search:
154+
rest_total_hits_as_int: true
155+
body:
156+
query:
157+
script_score:
158+
query: {match_all: {} }
159+
script:
160+
source: "maxSimInvHamming(params.query_vector, 'vector')"
161+
params:
162+
query_vector: [[1, 2, 1, 1, 1]]
163+
164+
- do:
165+
headers:
166+
Content-Type: application/json
167+
search:
168+
rest_total_hits_as_int: true
169+
body:
170+
query:
171+
script_score:
172+
query: {match_all: {} }
173+
script:
174+
source: "maxSimInvHamming(params.query_vector, 'byte_vector')"
175+
params:
176+
query_vector: [[1, 2, 1, 1, 1]]
177+
178+
- match: {hits.total: 2}
179+
180+
- match: {hits.hits.0._id: "3"}
181+
- close_to: {hits.hits.0._score: {value: 0.675, error: 0.01}}
182+
183+
- match: {hits.hits.1._id: "1"}
184+
- close_to: {hits.hits.1._score: {value: 0.65, error: 0.01}}
185+
186+
- do:
187+
headers:
188+
Content-Type: application/json
189+
search:
190+
rest_total_hits_as_int: true
191+
body:
192+
query:
193+
script_score:
194+
query: {match_all: {} }
195+
script:
196+
source: "maxSimInvHamming(params.query_vector, 'bit_vector')"
197+
params:
198+
query_vector: [[1, 2, 1, 1, 1]]
199+
200+
- match: {hits.total: 2}
201+
202+
- match: {hits.hits.0._id: "3"}
203+
- close_to: {hits.hits.0._score: {value: 0.675, error: 0.01}}
204+
205+
- match: {hits.hits.1._id: "1"}
206+
- close_to: {hits.hits.1._score: {value: 0.65, error: 0.01}}

server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ private SearchCapabilities() {}
3838
private static final String NESTED_RETRIEVER_INNER_HITS_SUPPORT = "nested_retriever_inner_hits_support";
3939
/** Support multi-dense-vector script field access. */
4040
private static final String MULTI_DENSE_VECTOR_SCRIPT_ACCESS = "multi_dense_vector_script_access";
41+
/** Initial support for multi-dense-vector maxSim functions access. */
42+
private static final String MULTI_DENSE_VECTOR_SCRIPT_MAX_SIM = "multi_dense_vector_script_max_sim";
4143

4244
private static final String RANDOM_SAMPLER_WITH_SCORED_SUBAGGS = "random_sampler_with_scored_subaggs";
4345

@@ -53,6 +55,7 @@ private SearchCapabilities() {}
5355
if (MultiDenseVectorFieldMapper.FEATURE_FLAG.isEnabled()) {
5456
capabilities.add(MULTI_DENSE_VECTOR_FIELD_MAPPER);
5557
capabilities.add(MULTI_DENSE_VECTOR_SCRIPT_ACCESS);
58+
capabilities.add(MULTI_DENSE_VECTOR_SCRIPT_MAX_SIM);
5659
}
5760
if (Build.current().isSnapshot()) {
5861
capabilities.add(KQL_QUERY_SUPPORTED);

0 commit comments

Comments
 (0)