Skip to content

Commit 04d1929

Browse files
kderussoelasticmachinejimczi
authored
Allow fields with dots in sparse vector field mapper (#111981)
* Remove dot validation for sparse vector field mapper * Update docs/changelog/111981.yaml * Update changelog * Fix test permissions * PR feedback - yaml test * PR feedback - remove non-dot values from sparse vector query in test to verify the dot is searched correctly * Add additional test cases for field collissions * Update docs/changelog/111981.yaml * Update key for SparseVectorFieldMapper * Fix test * Update yaml test to include scores * Update server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java Co-authored-by: Jim Ferenczi <[email protected]> * Revert "Update server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java" This reverts commit 58fc087. * PR feedback - escape dots --------- Co-authored-by: Elastic Machine <[email protected]> Co-authored-by: Jim Ferenczi <[email protected]>
1 parent 72248e3 commit 04d1929

File tree

4 files changed

+192
-13
lines changed

4 files changed

+192
-13
lines changed

docs/changelog/111981.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 111981
2+
summary: Allow fields with dots in sparse vector field mapper
3+
area: Mapping
4+
type: enhancement
5+
issues:
6+
- 109118

server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -177,15 +177,11 @@ public void parse(DocumentParserContext context) throws IOException {
177177
for (Token token = context.parser().nextToken(); token != Token.END_OBJECT; token = context.parser().nextToken()) {
178178
if (token == Token.FIELD_NAME) {
179179
feature = context.parser().currentName();
180-
if (feature.contains(".")) {
181-
throw new IllegalArgumentException(
182-
"[sparse_vector] fields do not support dots in feature names but found [" + feature + "]"
183-
);
184-
}
185180
} else if (token == Token.VALUE_NULL) {
186181
// ignore feature, this is consistent with numeric fields
187182
} else if (token == Token.VALUE_NUMBER || token == Token.VALUE_STRING) {
188-
final String key = fullPath() + "." + feature;
183+
// Use a delimiter that won't collide with subfields & escape the dots in the feature name
184+
final String key = fullPath() + "\\." + feature.replace(".", "\\.");
189185
float value = context.parser().floatValue(true);
190186

191187
// if we have an existing feature of the same name we'll select for the one with the max value

server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -111,12 +111,26 @@ public void testDefaults() throws Exception {
111111

112112
public void testDotInFieldName() throws Exception {
113113
DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping));
114-
DocumentParsingException ex = expectThrows(
115-
DocumentParsingException.class,
116-
() -> mapper.parse(source(b -> b.field("field", Map.of("politi.cs", 10, "sports", 20))))
117-
);
118-
assertThat(ex.getCause().getMessage(), containsString("do not support dots in feature names"));
119-
assertThat(ex.getCause().getMessage(), containsString("politi.cs"));
114+
ParsedDocument parsedDocument = mapper.parse(source(b -> b.field("field", Map.of("foo.bar", 10, "foobar", 20))));
115+
116+
List<IndexableField> fields = parsedDocument.rootDoc().getFields("field");
117+
assertEquals(2, fields.size());
118+
assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class));
119+
FeatureField featureField1 = null;
120+
FeatureField featureField2 = null;
121+
for (IndexableField field : fields) {
122+
if (field.stringValue().equals("foo.bar")) {
123+
featureField1 = (FeatureField) field;
124+
} else if (field.stringValue().equals("foobar")) {
125+
featureField2 = (FeatureField) field;
126+
} else {
127+
throw new UnsupportedOperationException();
128+
}
129+
}
130+
131+
int freq1 = getFrequency(featureField1.tokenStream(null, null));
132+
int freq2 = getFrequency(featureField2.tokenStream(null, null));
133+
assertTrue(freq1 < freq2);
120134
}
121135

122136
public void testHandlesMultiValuedFields() throws MapperParsingException, IOException {
@@ -156,7 +170,7 @@ public void testHandlesMultiValuedFields() throws MapperParsingException, IOExce
156170
}));
157171

158172
// then validate that the generate document stored both values appropriately and we have only the max value stored
159-
FeatureField barField = ((FeatureField) doc1.rootDoc().getByKey("foo.field.bar"));
173+
FeatureField barField = ((FeatureField) doc1.rootDoc().getByKey("foo.field\\.bar"));
160174
assertEquals(20, barField.getFeatureValue(), 1);
161175

162176
FeatureField storedBarField = ((FeatureField) doc1.rootDoc().getFields("foo.field").get(1));

x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,3 +313,166 @@ setup:
313313
query: "octopus comforter smells"
314314

315315
- match: { status: 400 }
316+
317+
---
318+
"Search on a sparse_vector field with dots in the field names":
319+
320+
- requires:
321+
cluster_features: [ "gte_v8.16.0" ]
322+
reason: dots in field names allowed starting in in 8.16.0
323+
324+
- do:
325+
indices.create:
326+
index: index-with-sparse-vector2
327+
body:
328+
mappings:
329+
properties:
330+
ml.tokens:
331+
type: sparse_vector
332+
333+
- match: { acknowledged: true }
334+
335+
- do:
336+
headers:
337+
Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
338+
index:
339+
index: index-with-sparse-vector2
340+
id: "has-dots"
341+
refresh: true
342+
body:
343+
ml:
344+
tokens:
345+
running: 2.4097164
346+
good: 2.170997
347+
run: 2.052153
348+
race: 1.4575411
349+
for: 1.1908325
350+
5.0k: 2.489943
351+
352+
- match: { result: "created" }
353+
354+
- do:
355+
headers:
356+
Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
357+
get:
358+
index: index-with-sparse-vector2
359+
id: "has-dots"
360+
361+
- match:
362+
_source:
363+
ml:
364+
tokens:
365+
running: 2.4097164
366+
good: 2.170997
367+
run: 2.052153
368+
race: 1.4575411
369+
for: 1.1908325
370+
5.0k: 2.489943
371+
372+
- do:
373+
search:
374+
index: index-with-sparse-vector2
375+
body:
376+
query:
377+
sparse_vector:
378+
field: ml.tokens
379+
query_vector:
380+
5.0k: 2.489943
381+
382+
- match: { hits.total.value: 1 }
383+
384+
---
385+
"Search on a nested sparse_vector field with dots in the field names and conflicting child fields":
386+
387+
- requires:
388+
cluster_features: [ "gte_v8.16.0" ]
389+
reason: dots in field names allowed starting in in 8.16.0
390+
391+
- do:
392+
indices.create:
393+
index: index-with-sparse-vector3
394+
body:
395+
mappings:
396+
properties:
397+
parent:
398+
type: object
399+
subobjects: false
400+
properties:
401+
foo:
402+
type: sparse_vector
403+
foo.bar:
404+
type: sparse_vector
405+
406+
- match: { acknowledged: true }
407+
408+
- do:
409+
headers:
410+
Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
411+
Content-Type: application/json
412+
bulk:
413+
index: index-with-sparse-vector3
414+
refresh: true
415+
body: |
416+
{"index": { "_id": "parent-foo" }}
417+
{"parent.foo": { "bar.baz": 1.0 }}
418+
{"index": { "_id": "parent-foo-bar" }}
419+
{"parent.foo.bar": { "baz": 2.0 }}
420+
{"index": { "_id": "both-docs" }}
421+
{"parent.foo": { "bar.baz": 3.0 }, "parent.foo.bar": { "baz": 4.0 }}
422+
423+
424+
- do:
425+
headers:
426+
Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
427+
get:
428+
index: index-with-sparse-vector3
429+
id: "parent-foo"
430+
431+
- match:
432+
_source:
433+
parent.foo:
434+
bar.baz: 1.0
435+
436+
- do:
437+
headers:
438+
Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
439+
get:
440+
index: index-with-sparse-vector3
441+
id: "parent-foo-bar"
442+
443+
- match:
444+
_source:
445+
parent.foo.bar:
446+
baz: 2.0
447+
448+
- do:
449+
search:
450+
index: index-with-sparse-vector3
451+
body:
452+
query:
453+
sparse_vector:
454+
field: parent.foo
455+
query_vector:
456+
bar.baz: 1.0
457+
458+
- match: { hits.total.value: 2 }
459+
- match: { hits.hits.0._id: "both-docs" }
460+
- match: { hits.hits.0._score: 3.0 }
461+
- match: { hits.hits.1._id: "parent-foo" }
462+
- match: { hits.hits.1._score: 1.0 }
463+
464+
- do:
465+
search:
466+
index: index-with-sparse-vector3
467+
body:
468+
query:
469+
sparse_vector:
470+
field: parent.foo.bar
471+
query_vector:
472+
baz: 1.0
473+
474+
- match: { hits.total.value: 2 }
475+
- match: { hits.hits.0._id: "both-docs" }
476+
- match: { hits.hits.0._score: 4.0 }
477+
- match: { hits.hits.1._id: "parent-foo-bar" }
478+
- match: { hits.hits.1._score: 2.0 }

0 commit comments

Comments
 (0)