Skip to content

Commit 6d81ff9

Browse files
authored
Add synthetic vectors support for sparse_vector (elastic#130756)
This change adds the support for synthetic vectors (added in elastic#130382) in the sparse_vector field type.
1 parent 83076c2 commit 6d81ff9

File tree

9 files changed

+511
-84
lines changed

9 files changed

+511
-84
lines changed
Lines changed: 380 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,380 @@
1+
setup:
  # Gate the whole suite on the `synthetic_vectors_setting` capability of
  # GET /_search.
  - requires:
      reason: 'synthetic vectors are required'
      test_runner_features:
        - capabilities
      capabilities:
        - method: GET
          path: /_search
          capabilities:
            - synthetic_vectors_setting
  # NOTE(review): several tests below send a custom Content-Type header;
  # presumably this entry gates on runner support for `headers` - confirm
  # against the REST test framework documentation.
  - skip:
      features: "headers"

  # Index with synthetic vectors enabled and a sparse_vector field both at the
  # top level (`emb`) and inside a nested object (`nested.emb`).
  - do:
      indices.create:
        index: test
        body:
          settings:
            index.mapping.synthetic_vectors: true
          mappings:
            properties:
              name: { type: keyword }
              emb: { type: sparse_vector }
              nested:
                type: nested
                properties:
                  paragraph_id: { type: keyword }
                  emb: { type: sparse_vector }

  # Doc 1: top-level vector only.
  - do:
      index:
        index: test
        id: "1"
        body:
          name: cow.jpg
          emb: { token_1: 2.0, token_2: 3.0 }

  # Doc 2: three nested paragraphs, each carrying a vector.
  - do:
      index:
        index: test
        id: "2"
        body:
          name: moose.jpg
          nested:
            - paragraph_id: 0
              emb: { token_1: 2.0, token_2: 3.0 }
            - paragraph_id: 2
              emb: { token_3: 2.0, token_2: 3.0 }
            - paragraph_id: 3
              emb: { token_3: 2.0, token_7: 3.0, token_1: 4.0 }

  # Doc 3: top-level vector only.
  - do:
      index:
        index: test
        id: "3"
        body:
          name: rabbit.jpg
          emb: { token_3: 2.0, token_9: 3.0, token_2: 4.0 }

  # Doc 4: three nested paragraphs; the middle one has no vector.
  - do:
      index:
        index: test
        id: "4"
        body:
          name: zoolander.jpg
          nested:
            - paragraph_id: 0
              emb: { token_3: 2.0, token_7: 3.0, token_1: 4.0 }
            - paragraph_id: 1
            - paragraph_id: 2
              emb: { token_8: 2.0 }

  - do:
      indices.refresh: {}
94+
95+
---
"exclude synthetic vectors":
  # Search without any `_source` options; the assertions below expect every
  # `emb` vector (top-level and nested) to be absent from `_source`.
  - do:
      search:
        index: test
        body:
          sort:
            - "name"

  # Doc 1: top-level vector is omitted.
  - match: { hits.hits.0._id: "1" }
  - match: { hits.hits.0._source.name: "cow.jpg" }
  - not_exists: hits.hits.0._source.emb

  # Doc 2: all three nested objects remain, minus their vectors.
  - match: { hits.hits.1._id: "2" }
  - match: { hits.hits.1._source.name: "moose.jpg" }
  - length: { hits.hits.1._source.nested: 3 }
  - not_exists: hits.hits.1._source.nested.0.emb
  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
  - not_exists: hits.hits.1._source.nested.1.emb
  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
  - not_exists: hits.hits.1._source.nested.2.emb
  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

  # Doc 3: top-level vector is omitted.
  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - not_exists: hits.hits.2._source.emb

  # Doc 4: all three nested objects remain, minus their vectors.
  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - not_exists: hits.hits.3._source.nested.0.emb
  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
  - match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
  - not_exists: hits.hits.3._source.nested.2.emb
  - match: { hits.hits.3._source.nested.2.paragraph_id: 2 }
129+
130+
---
"include synthetic vectors":
  # 1) `exclude_vectors: false` - vectors are expected back in `_source`.
  - do:
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
          sort: ["name"]

  - match: { hits.hits.0._id: "1" }
  - match: { hits.hits.0._source.name: "cow.jpg" }
  - exists: hits.hits.0._source.emb

  - match: { hits.hits.1._id: "2" }
  - match: { hits.hits.1._source.name: "moose.jpg" }
  - length: { hits.hits.1._source.nested: 3 }
  - exists: hits.hits.1._source.nested.0.emb
  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
  - exists: hits.hits.1._source.nested.1.emb
  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
  - exists: hits.hits.1._source.nested.2.emb
  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - exists: hits.hits.2._source.emb

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - exists: hits.hits.3._source.nested.0.emb
  - length: { hits.hits.3._source.nested.0.emb: 3 }
  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }

  # 2) Source filtering on `nested.emb` only: docs without nested vectors are
  #    expected to return an empty `_source`, and `paragraph_id` is dropped.
  - do:
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
            includes: nested.emb
          sort: ["name"]

  - match: { hits.hits.0._id: "1" }
  - length: { hits.hits.0._source: 0 }

  - match: { hits.hits.1._id: "2" }
  # Checks hit 1 (doc "2"); this previously pointed at hits.hits.3, leaving
  # the size of hit 1's `_source` unverified.
  - length: { hits.hits.1._source: 1 }
  - length: { hits.hits.1._source.nested: 3 }
  - exists: hits.hits.1._source.nested.0.emb
  - not_exists: hits.hits.1._source.nested.0.paragraph_id
  - exists: hits.hits.1._source.nested.1.emb
  - not_exists: hits.hits.1._source.nested.1.paragraph_id
  - exists: hits.hits.1._source.nested.2.emb
  - not_exists: hits.hits.1._source.nested.2.paragraph_id

  - match: { hits.hits.2._id: "3" }
  - length: { hits.hits.2._source: 0 }

  # Doc 4 was indexed with one nested paragraph lacking `emb`, so only two
  # nested entries are expected here.
  - match: { hits.hits.3._id: "4" }
  - length: { hits.hits.3._source: 1 }
  - length: { hits.hits.3._source.nested: 2 }
  - exists: hits.hits.3._source.nested.0.emb
  - length: { hits.hits.3._source.nested.0.emb: 3 }
  - not_exists: hits.hits.3._source.nested.0.paragraph_id
  - exists: hits.hits.3._source.nested.1.emb
  - length: { hits.hits.3._source.nested.1.emb: 1 }
  - not_exists: hits.hits.3._source.nested.1.paragraph_id

  # 3) Vectors excluded from `_source` but requested through `fields`.
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            exclude_vectors: true
          sort: ["name"]
          fields: ["emb"]

  - match: { hits.hits.0._id: "1" }
  - match: { hits.hits.0._source.name: "cow.jpg" }
  - not_exists: hits.hits.0._source.emb
  - length: { hits.hits.0.fields.emb: 1 }
  - length: { hits.hits.0.fields.emb.0: 2 }
  - match: { hits.hits.0.fields.emb.0.token_1: 2.0 }
  - match: { hits.hits.0.fields.emb.0.token_2: 3.0 }

  - match: { hits.hits.1._id: "2" }
  - match: { hits.hits.1._source.name: "moose.jpg" }
  - length: { hits.hits.1._source.nested: 3 }
  - not_exists: hits.hits.1._source.nested.0.emb

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - length: { hits.hits.2.fields.emb: 1 }
  - length: { hits.hits.2.fields.emb.0: 3 }
  - match: { hits.hits.2.fields.emb.0.token_2: 4.0 }
  - match: { hits.hits.2.fields.emb.0.token_3: 2.0 }
  - match: { hits.hits.2.fields.emb.0.token_9: 3.0 }

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - not_exists: hits.hits.3._source.nested.0.emb
237+
238+
239+
---
"Bulk partial update with synthetic vectors":
  # Partially update doc 4 through the bulk API, adding a top-level `emb`,
  # and ask for the updated `_source` in the response.
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      bulk:
        index: test
        _source: true
        body:
          - '{"update": {"_id": "4"}}'
          - >
            {
              "doc": {
                "name": "zoolander2.jpg",
                "emb": {
                  "token_12": 2.0,
                  "token_13": 1.0
                }
              }
            }

  # The response must contain the new top-level vector and the nested
  # vectors that were indexed in setup.
  - length: { items.0.update.get._source.emb: 2 }
  - match: { items.0.update.get._source.emb.token_12: 2.0 }
  - match: { items.0.update.get._source.emb.token_13: 1.0 }
  - exists: items.0.update.get._source.nested
  - length: { items.0.update.get._source.nested: 3 }
  - exists: items.0.update.get._source.nested.0.emb
  - match: { items.0.update.get._source.nested.0.paragraph_id: 0 }
  - length: { items.0.update.get._source.nested.0.emb: 3 }
  - not_exists: items.0.update.get._source.nested.1.emb
  - match: { items.0.update.get._source.nested.1.paragraph_id: 1 }
  - exists: items.0.update.get._source.nested.2.emb
  - length: { items.0.update.get._source.nested.2.emb: 1 }
  - match: { items.0.update.get._source.nested.2.paragraph_id: 2 }
  # Stash the nested array so later reads can be compared against it.
  - set: { items.0.update.get._source.nested: original_nested }

  # Read the doc back with GET, vectors included.
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      get:
        _source_exclude_vectors: false
        index: test
        id: "4"

  - match: { _source.name: zoolander2.jpg }
  - length: { _source.emb: 2 }
  - match: { _source.emb.token_12: 2.0 }
  - match: { _source.emb.token_13: 1.0 }
  - match: { _source.nested: $original_nested }

  - do:
      indices.refresh: {}

  # Same check through search after the refresh.
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the embeddings as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
          query:
            term:
              # Quoted so the id is a string, matching `id: "4"` used elsewhere.
              _id: "4"

  - match: { hits.total.value: 1 }
  - match: { hits.total.relation: eq }
  - match: { hits.hits.0._source.name: zoolander2.jpg }
  - match: { hits.hits.0._source.nested: $original_nested }
311+
312+
---
"Partial update with synthetic vectors":
  # Partially update doc 4 through the update API, adding a top-level `emb`,
  # and ask for the updated `_source` in the response.
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      update:
        index: test
        id: "4"
        body:
          _source: true
          doc:
            name: zoolander3.jpg
            emb:
              token_3: 2.0
              token_9: 2.5

  # The response must contain the new top-level vector and the nested
  # vectors that were indexed in setup.
  - length: { get._source.emb: 2 }
  - match: { get._source.emb.token_3: 2.0 }
  - match: { get._source.emb.token_9: 2.5 }
  - exists: get._source.nested
  - length: { get._source.nested: 3 }
  - exists: get._source.nested.0.emb
  - match: { get._source.nested.0.paragraph_id: 0 }
  - length: { get._source.nested.0.emb: 3 }
  - not_exists: get._source.nested.1.emb
  - match: { get._source.nested.1.paragraph_id: 1 }
  - exists: get._source.nested.2.emb
  - length: { get._source.nested.2.emb: 1 }
  - match: { get._source.nested.2.paragraph_id: 2 }
  # Stash the nested array so later reads can be compared against it.
  - set: { get._source.nested: original_nested }

  # Read the doc back with GET, vectors included.
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      get:
        _source_exclude_vectors: false
        index: test
        id: "4"

  - length: { _source.emb: 2 }
  - match: { _source.emb.token_3: 2.0 }
  - match: { _source.emb.token_9: 2.5 }
  - match: { _source.name: zoolander3.jpg }
  - match: { _source.nested: $original_nested }

  - do:
      indices.refresh: {}

  # Same check through search after the refresh.
  - do:
      headers:
        # Force JSON content type so that we use a parser that interprets the vectors as doubles
        Content-Type: application/json
      search:
        index: test
        body:
          _source:
            exclude_vectors: false
          query:
            term:
              # Quoted so the id is a string, matching `id: "4"` used elsewhere.
              _id: "4"

  - match: { hits.total.value: 1 }
  - match: { hits.total.relation: eq }
  - match: { hits.hits.0._source.name: zoolander3.jpg }
  - match: { hits.hits.0._source.nested: $original_nested }

0 commit comments

Comments
 (0)