Skip to content

Commit e130459

Browse files
authored
Add option to store sparse_vector outside _source (#117917) (#118018)
This PR introduces an option for `sparse_vector` to store its values separately from `_source` by using term vectors. This capability is primarily needed by the semantic text field.
1 parent 171d7c4 commit e130459

File tree

9 files changed

+589
-29
lines changed

9 files changed

+589
-29
lines changed

docs/changelog/117917.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 117917
2+
summary: Add option to store `sparse_vector` outside `_source`
3+
area: Mapping
4+
type: feature
5+
issues: []

docs/reference/mapping/types/sparse-vector.asciidoc

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,23 @@ PUT my-index
2626

2727
See <<semantic-search-elser, semantic search with ELSER>> for a complete example on adding documents to a `sparse_vector` mapped field using ELSER.
2828

29+
[[sparse-vectors-params]]
30+
==== Parameters for `sparse_vector` fields
31+
32+
The following parameters are accepted by `sparse_vector` fields:
33+
34+
[horizontal]
35+
36+
<<mapping-store,store>>::
37+
38+
Indicates whether the field value should be stored and retrievable independently of the <<mapping-source-field,_source>> field.
39+
Accepted values: true or false (default).
40+
The field's data is stored using term vectors, a disk-efficient structure compared to the original JSON input.
41+
The input map can be retrieved during a search request via the <<search-fields-param,`fields` parameter>>.
42+
To benefit from reduced disk usage, you must either:
43+
* Exclude the field from <<source-filtering, _source>>.
44+
* Use <<synthetic-source,synthetic `_source`>>.
45+
2946
[[index-multi-value-sparse-vectors]]
3047
==== Multi-value sparse vectors
3148

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,3 +472,120 @@
472472

473473
- match:
474474
_source.ml.tokens: {}
475+
476+
---
477+
"stored sparse_vector":
478+
479+
- requires:
480+
cluster_features: [ "mapper.sparse_vector.store_support" ]
481+
reason: "sparse_vector supports store parameter"
482+
483+
- do:
484+
indices.create:
485+
index: test
486+
body:
487+
mappings:
488+
properties:
489+
ml.tokens:
490+
type: sparse_vector
491+
store: true
492+
493+
- match: { acknowledged: true }
494+
- do:
495+
index:
496+
index: test
497+
id: "1"
498+
body:
499+
ml:
500+
tokens:
501+
running: 2
502+
good: 3
503+
run: 5
504+
race: 7
505+
for: 9
506+
507+
- match: { result: "created" }
508+
509+
- do:
510+
indices.refresh: { }
511+
512+
- do:
513+
search:
514+
index: test
515+
body:
516+
fields: [ "ml.tokens" ]
517+
518+
- length: { hits.hits.0.fields.ml\\.tokens: 1 }
519+
- length: { hits.hits.0.fields.ml\\.tokens.0: 5 }
520+
- match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 }
521+
- match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 }
522+
- match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 }
523+
- match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 }
524+
- match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 }
525+
526+
---
527+
"stored sparse_vector synthetic source":
528+
529+
- requires:
530+
cluster_features: [ "mapper.source.mode_from_index_setting", "mapper.sparse_vector.store_support" ]
531+
reason: "sparse_vector supports store parameter"
532+
533+
- do:
534+
indices.create:
535+
index: test
536+
body:
537+
settings:
538+
index:
539+
mapping.source.mode: synthetic
540+
mappings:
541+
properties:
542+
ml.tokens:
543+
type: sparse_vector
544+
store: true
545+
546+
- match: { acknowledged: true }
547+
548+
- do:
549+
index:
550+
index: test
551+
id: "1"
552+
body:
553+
ml:
554+
tokens:
555+
running: 2
556+
good: 3
557+
run: 5
558+
race: 7
559+
for: 9
560+
561+
- match: { result: "created" }
562+
563+
- do:
564+
indices.refresh: { }
565+
566+
- do:
567+
search:
568+
index: test
569+
body:
570+
fields: [ "ml.tokens" ]
571+
572+
- match:
573+
hits.hits.0._source: {
574+
ml: {
575+
tokens: {
576+
running: 2.0,
577+
good: 3.0,
578+
run: 5.0,
579+
race: 7.0,
580+
for: 9.0
581+
}
582+
}
583+
}
584+
585+
- length: { hits.hits.0.fields.ml\\.tokens: 1 }
586+
- length: { hits.hits.0.fields.ml\\.tokens.0: 5 }
587+
- match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 }
588+
- match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 }
589+
- match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 }
590+
- match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 }
591+
- match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 }

server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ public Set<NodeFeature> getFeatures() {
5656
);
5757

5858
public static final NodeFeature META_FETCH_FIELDS_ERROR_CODE_CHANGED = new NodeFeature("meta_fetch_fields_error_code_changed");
59+
public static final NodeFeature SPARSE_VECTOR_STORE_SUPPORT = new NodeFeature("mapper.sparse_vector.store_support");
5960

6061
@Override
6162
public Set<NodeFeature> getTestFeatures() {
@@ -68,7 +69,8 @@ public Set<NodeFeature> getTestFeatures() {
6869
MapperService.LOGSDB_DEFAULT_IGNORE_DYNAMIC_BEYOND_LIMIT,
6970
DocumentParser.FIX_PARSING_SUBOBJECTS_FALSE_DYNAMIC_FALSE,
7071
CONSTANT_KEYWORD_SYNTHETIC_SOURCE_WRITE_FIX,
71-
META_FETCH_FIELDS_ERROR_CODE_CHANGED
72+
META_FETCH_FIELDS_ERROR_CODE_CHANGED,
73+
SPARSE_VECTOR_STORE_SUPPORT
7274
);
7375
}
7476
}

server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java

Lines changed: 147 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@
1111

1212
import org.apache.lucene.document.FeatureField;
1313
import org.apache.lucene.index.IndexableField;
14+
import org.apache.lucene.index.LeafReader;
15+
import org.apache.lucene.index.LeafReaderContext;
16+
import org.apache.lucene.index.PostingsEnum;
17+
import org.apache.lucene.index.TermVectors;
18+
import org.apache.lucene.index.TermsEnum;
19+
import org.apache.lucene.search.DocIdSetIterator;
1420
import org.apache.lucene.search.MatchNoDocsQuery;
1521
import org.apache.lucene.search.Query;
1622
import org.apache.lucene.util.BytesRef;
@@ -25,14 +31,22 @@
2531
import org.elasticsearch.index.mapper.FieldMapper;
2632
import org.elasticsearch.index.mapper.MappedFieldType;
2733
import org.elasticsearch.index.mapper.MapperBuilderContext;
34+
import org.elasticsearch.index.mapper.SourceLoader;
2835
import org.elasticsearch.index.mapper.SourceValueFetcher;
2936
import org.elasticsearch.index.mapper.TextSearchInfo;
3037
import org.elasticsearch.index.mapper.ValueFetcher;
3138
import org.elasticsearch.index.query.SearchExecutionContext;
39+
import org.elasticsearch.search.fetch.StoredFieldsSpec;
40+
import org.elasticsearch.search.lookup.Source;
41+
import org.elasticsearch.xcontent.XContentBuilder;
3242
import org.elasticsearch.xcontent.XContentParser.Token;
3343

3444
import java.io.IOException;
45+
import java.io.UncheckedIOException;
46+
import java.util.LinkedHashMap;
47+
import java.util.List;
3548
import java.util.Map;
49+
import java.util.stream.Stream;
3650

3751
import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;
3852

@@ -52,8 +66,12 @@ public class SparseVectorFieldMapper extends FieldMapper {
5266
static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR;
5367
static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT;
5468

55-
public static class Builder extends FieldMapper.Builder {
69+
private static SparseVectorFieldMapper toType(FieldMapper in) {
70+
return (SparseVectorFieldMapper) in;
71+
}
5672

73+
public static class Builder extends FieldMapper.Builder {
74+
private final Parameter<Boolean> stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false);
5775
private final Parameter<Map<String, String>> meta = Parameter.metaParam();
5876

5977
public Builder(String name) {
@@ -62,14 +80,14 @@ public Builder(String name) {
6280

6381
@Override
6482
protected Parameter<?>[] getParameters() {
65-
return new Parameter<?>[] { meta };
83+
return new Parameter<?>[] { stored, meta };
6684
}
6785

6886
@Override
6987
public SparseVectorFieldMapper build(MapperBuilderContext context) {
7088
return new SparseVectorFieldMapper(
7189
leafName(),
72-
new SparseVectorFieldType(context.buildFullName(leafName()), meta.getValue()),
90+
new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue()),
7391
builderParams(this, context)
7492
);
7593
}
@@ -87,8 +105,8 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) {
87105

88106
public static final class SparseVectorFieldType extends MappedFieldType {
89107

90-
public SparseVectorFieldType(String name, Map<String, String> meta) {
91-
super(name, true, false, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
108+
public SparseVectorFieldType(String name, boolean isStored, Map<String, String> meta) {
109+
super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
92110
}
93111

94112
@Override
@@ -103,6 +121,9 @@ public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext
103121

104122
@Override
105123
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
124+
if (isStored()) {
125+
return new SparseVectorValueFetcher(name());
126+
}
106127
return SourceValueFetcher.identity(name(), context, format);
107128
}
108129

@@ -135,6 +156,14 @@ private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldTy
135156
super(simpleName, mappedFieldType, builderParams);
136157
}
137158

159+
@Override
160+
protected SyntheticSourceSupport syntheticSourceSupport() {
161+
if (fieldType().isStored()) {
162+
return new SyntheticSourceSupport.Native(new SparseVectorSyntheticFieldLoader(fullPath(), leafName()));
163+
}
164+
return super.syntheticSourceSupport();
165+
}
166+
138167
@Override
139168
public Map<String, NamedAnalyzer> indexAnalyzers() {
140169
return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER);
@@ -189,9 +218,9 @@ public void parse(DocumentParserContext context) throws IOException {
189218
// based on recommendations from this paper: https://arxiv.org/pdf/2305.18494.pdf
190219
IndexableField currentField = context.doc().getByKey(key);
191220
if (currentField == null) {
192-
context.doc().addWithKey(key, new FeatureField(fullPath(), feature, value));
193-
} else if (currentField instanceof FeatureField && ((FeatureField) currentField).getFeatureValue() < value) {
194-
((FeatureField) currentField).setFeatureValue(value);
221+
context.doc().addWithKey(key, new XFeatureField(fullPath(), feature, value, fieldType().isStored()));
222+
} else if (currentField instanceof XFeatureField && ((XFeatureField) currentField).getFeatureValue() < value) {
223+
((XFeatureField) currentField).setFeatureValue(value);
195224
}
196225
} else {
197226
throw new IllegalArgumentException(
@@ -219,4 +248,114 @@ protected String contentType() {
219248
return CONTENT_TYPE;
220249
}
221250

251+
private static class SparseVectorValueFetcher implements ValueFetcher {
252+
private final String fieldName;
253+
private TermVectors termVectors;
254+
255+
private SparseVectorValueFetcher(String fieldName) {
256+
this.fieldName = fieldName;
257+
}
258+
259+
@Override
260+
public void setNextReader(LeafReaderContext context) {
261+
try {
262+
termVectors = context.reader().termVectors();
263+
} catch (IOException exc) {
264+
throw new UncheckedIOException(exc);
265+
}
266+
}
267+
268+
@Override
269+
public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValues) throws IOException {
270+
if (termVectors == null) {
271+
return List.of();
272+
}
273+
var terms = termVectors.get(doc, fieldName);
274+
if (terms == null) {
275+
return List.of();
276+
}
277+
278+
var termsEnum = terms.iterator();
279+
PostingsEnum postingsScratch = null;
280+
Map<String, Float> result = new LinkedHashMap<>();
281+
while (termsEnum.next() != null) {
282+
postingsScratch = termsEnum.postings(postingsScratch);
283+
postingsScratch.nextDoc();
284+
result.put(termsEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(postingsScratch.freq()));
285+
assert postingsScratch.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
286+
}
287+
return List.of(result);
288+
}
289+
290+
@Override
291+
public StoredFieldsSpec storedFieldsSpec() {
292+
return StoredFieldsSpec.NO_REQUIREMENTS;
293+
}
294+
}
295+
296+
private static class SparseVectorSyntheticFieldLoader implements SourceLoader.SyntheticFieldLoader {
297+
private final String fullPath;
298+
private final String leafName;
299+
300+
private TermsEnum termsDocEnum;
301+
302+
private SparseVectorSyntheticFieldLoader(String fullPath, String leafName) {
303+
this.fullPath = fullPath;
304+
this.leafName = leafName;
305+
}
306+
307+
@Override
308+
public Stream<Map.Entry<String, StoredFieldLoader>> storedFieldLoaders() {
309+
return Stream.of();
310+
}
311+
312+
@Override
313+
public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
314+
var fieldInfos = leafReader.getFieldInfos().fieldInfo(fullPath);
315+
if (fieldInfos == null || fieldInfos.hasVectors() == false) {
316+
return null;
317+
}
318+
return docId -> {
319+
var terms = leafReader.termVectors().get(docId, fullPath);
320+
if (terms == null) {
321+
return false;
322+
}
323+
termsDocEnum = terms.iterator();
324+
if (termsDocEnum.next() == null) {
325+
termsDocEnum = null;
326+
return false;
327+
}
328+
return true;
329+
};
330+
}
331+
332+
@Override
333+
public boolean hasValue() {
334+
return termsDocEnum != null;
335+
}
336+
337+
@Override
338+
public void write(XContentBuilder b) throws IOException {
339+
assert termsDocEnum != null;
340+
PostingsEnum reuse = null;
341+
b.startObject(leafName);
342+
do {
343+
reuse = termsDocEnum.postings(reuse);
344+
reuse.nextDoc();
345+
b.field(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
346+
} while (termsDocEnum.next() != null);
347+
b.endObject();
348+
}
349+
350+
@Override
351+
public String fieldName() {
352+
return leafName;
353+
}
354+
355+
@Override
356+
public void reset() {
357+
termsDocEnum = null;
358+
}
359+
}
360+
222361
}

0 commit comments

Comments
 (0)