From 453cf48d885092cb8e5bb8b1b086da1778c2d273 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Mon, 7 Jul 2025 18:36:34 +0100 Subject: [PATCH 1/3] Add synthetic vectors support for sparse_vector This change adds the support for synthetic vectors (added in #130382) in the sparse_vector field type. --- ...=> 240_source_synthetic_dense_vectors.yml} | 0 .../250_source_synthetic_sparse_vectors.yml | 380 ++++++++++++++++++ .../vectors/DenseVectorFieldMapper.java | 34 +- .../vectors/SparseVectorFieldMapper.java | 59 ++- .../SyntheticVectorsPatchFieldLoader.java | 41 ++ .../vectors/SparseVectorFieldMapperTests.java | 39 +- .../mapper/SemanticTextFieldMapper.java | 2 +- .../src/main/java/module-info.java | 1 + .../mapper/RankVectorsFieldMapper.java | 39 +- 9 files changed, 511 insertions(+), 84 deletions(-) rename rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/{240_source_synthetic_vectors.yml => 240_source_synthetic_dense_vectors.yml} (100%) create mode 100644 rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/250_source_synthetic_sparse_vectors.yml create mode 100644 server/src/main/java/org/elasticsearch/index/mapper/vectors/SyntheticVectorsPatchFieldLoader.java diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/240_source_synthetic_vectors.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/240_source_synthetic_dense_vectors.yml similarity index 100% rename from rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/240_source_synthetic_vectors.yml rename to rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/240_source_synthetic_dense_vectors.yml diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/250_source_synthetic_sparse_vectors.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/250_source_synthetic_sparse_vectors.yml new file mode 100644 
index 0000000000000..53f0cd33da7d3 --- /dev/null +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/250_source_synthetic_sparse_vectors.yml @@ -0,0 +1,380 @@ +setup: + - requires: + reason: 'synthetic vectors are required' + test_runner_features: [ capabilities ] + capabilities: + - method: GET + path: /_search + capabilities: [ synthetic_vectors_setting ] + - skip: + features: "headers" + + - do: + indices.create: + index: test + body: + settings: + index.mapping.synthetic_vectors: true + mappings: + properties: + name: + type: keyword + emb: + type: sparse_vector + + nested: + type: nested + properties: + paragraph_id: + type: keyword + emb: + type: sparse_vector + + - do: + index: + index: test + id: "1" + body: + name: cow.jpg + emb: + token_1: 2.0 + token_2: 3.0 + + - do: + index: + index: test + id: "2" + body: + name: moose.jpg + nested: + - paragraph_id: 0 + emb: + token_1: 2.0 + token_2: 3.0 + - paragraph_id: 2 + emb: + token_3: 2.0 + token_2: 3.0 + - paragraph_id: 3 + emb: + token_3: 2.0 + token_7: 3.0 + token_1: 4.0 + + - do: + index: + index: test + id: "3" + body: + name: rabbit.jpg + emb: + token_3: 2.0 + token_9: 3.0 + token_2: 4.0 + + - do: + index: + index: test + id: "4" + body: + name: zoolander.jpg + nested: + - paragraph_id: 0 + emb: + token_3: 2.0 + token_7: 3.0 + token_1: 4.0 + - paragraph_id: 1 + - paragraph_id: 2 + emb: + token_8: 2.0 + + - do: + indices.refresh: {} + +--- +"exclude synthetic vectors": + - do: + search: + index: test + body: + sort: ["name"] + + - match: { hits.hits.0._id: "1"} + - match: { hits.hits.0._source.name: "cow.jpg"} + - not_exists: hits.hits.0._source.emb + + - match: { hits.hits.1._id: "2"} + - match: { hits.hits.1._source.name: "moose.jpg"} + - length: { hits.hits.1._source.nested: 3 } + - not_exists: hits.hits.1._source.nested.0.emb + - match: { hits.hits.1._source.nested.0.paragraph_id: 0 } + - not_exists: hits.hits.1._source.nested.1.emb + - match: { 
hits.hits.1._source.nested.1.paragraph_id: 2 } + - not_exists: hits.hits.1._source.nested.2.emb + - match: { hits.hits.1._source.nested.2.paragraph_id: 3 } + + - match: { hits.hits.2._id: "3" } + - match: { hits.hits.2._source.name: "rabbit.jpg" } + - not_exists: hits.hits.2._source.emb + + - match: { hits.hits.3._id: "4" } + - match: { hits.hits.3._source.name: "zoolander.jpg" } + - length: { hits.hits.3._source.nested: 3 } + - not_exists: hits.hits.3._source.nested.0.emb + - match: { hits.hits.3._source.nested.0.paragraph_id: 0 } + - match: { hits.hits.3._source.nested.1.paragraph_id: 1 } + - not_exists: hits.hits.3._source.nested.2.emb + - match: { hits.hits.3._source.nested.2.paragraph_id: 2 } + +--- +"include synthetic vectors": + - do: + search: + index: test + body: + _source: + exclude_vectors: false + sort: ["name"] + + - match: { hits.hits.0._id: "1"} + - match: { hits.hits.0._source.name: "cow.jpg"} + - exists: hits.hits.0._source.emb + + - match: { hits.hits.1._id: "2"} + - match: { hits.hits.1._source.name: "moose.jpg"} + - length: { hits.hits.1._source.nested: 3 } + - exists: hits.hits.1._source.nested.0.emb + - match: { hits.hits.1._source.nested.0.paragraph_id: 0 } + - exists: hits.hits.1._source.nested.1.emb + - match: { hits.hits.1._source.nested.1.paragraph_id: 2 } + - exists: hits.hits.1._source.nested.2.emb + - match: { hits.hits.1._source.nested.2.paragraph_id: 3 } + + - match: { hits.hits.2._id: "3" } + - match: { hits.hits.2._source.name: "rabbit.jpg" } + - exists: hits.hits.2._source.emb + + - match: { hits.hits.3._id: "4" } + - match: { hits.hits.3._source.name: "zoolander.jpg" } + - length: { hits.hits.3._source.nested: 3 } + - exists: hits.hits.3._source.nested.0.emb + - length: { hits.hits.3._source.nested.0.emb: 3 } + - match: { hits.hits.3._source.nested.0.paragraph_id: 0 } + + - do: + search: + index: test + body: + _source: + exclude_vectors: false + includes: nested.emb + sort: ["name"] + + - match: { hits.hits.0._id: "1"} + - 
length: { hits.hits.0._source: 0} + + - match: { hits.hits.1._id: "2"} + - length: { hits.hits.1._source: 1 } + - length: { hits.hits.1._source.nested: 3 } + - exists: hits.hits.1._source.nested.0.emb + - not_exists: hits.hits.1._source.nested.0.paragraph_id + - exists: hits.hits.1._source.nested.1.emb + - not_exists: hits.hits.1._source.nested.1.paragraph_id + - exists: hits.hits.1._source.nested.2.emb + - not_exists: hits.hits.1._source.nested.2.paragraph_id + + - match: { hits.hits.2._id: "3" } + - length: { hits.hits.2._source: 0} + + - match: { hits.hits.3._id: "4" } + - length: { hits.hits.3._source: 1 } + - length: { hits.hits.3._source.nested: 2 } + - exists: hits.hits.3._source.nested.0.emb + - length: { hits.hits.3._source.nested.0.emb: 3 } + - not_exists: hits.hits.3._source.nested.0.paragraph_id + - exists: hits.hits.3._source.nested.1.emb + - length: { hits.hits.3._source.nested.1.emb: 1 } + - not_exists: hits.hits.3._source.nested.1.paragraph_id + + - do: + headers: + # Force JSON content type so that we use a parser that interprets the embeddings as doubles + Content-Type: application/json + search: + index: test + body: + _source: + exclude_vectors: true + sort: ["name"] + fields: ["emb"] + + - match: { hits.hits.0._id: "1"} + - match: { hits.hits.0._source.name: "cow.jpg"} + - not_exists: hits.hits.0._source.emb + - length: { hits.hits.0.fields.emb: 1} + - length: { hits.hits.0.fields.emb.0: 2} + - match: { hits.hits.0.fields.emb.0.token_1: 2.0} + - match: { hits.hits.0.fields.emb.0.token_2: 3.0} + + - match: { hits.hits.1._id: "2"} + - match: { hits.hits.1._source.name: "moose.jpg"} + - length: { hits.hits.1._source.nested: 3 } + - not_exists: hits.hits.1._source.nested.0.emb + + - match: { hits.hits.2._id: "3" } + - match: { hits.hits.2._source.name: "rabbit.jpg" } + - length: { hits.hits.2.fields.emb: 1} + - length: { hits.hits.2.fields.emb.0: 3} + - match: { hits.hits.2.fields.emb.0.token_2: 4.0} + - match: { hits.hits.2.fields.emb.0.token_3: 
2.0} + - match: { hits.hits.2.fields.emb.0.token_9: 3.0} + + - match: { hits.hits.3._id: "4" } + - match: { hits.hits.3._source.name: "zoolander.jpg" } + - length: { hits.hits.3._source.nested: 3 } + - not_exists: hits.hits.3._source.nested.0.emb + + +--- +"Bulk partial update with synthetic vectors": + - do: + headers: + # Force JSON content type so that we use a parser that interprets the embeddings as doubles + Content-Type: application/json + bulk: + index: test + _source: true + body: + - '{"update": {"_id": "4"}}' + - > + { + "doc": { + "name": "zoolander2.jpg", + "emb": { + "token_12": 2.0, + "token_13": 1.0 + } + } + } + + - length: { items.0.update.get._source.emb: 2 } + - match: { items.0.update.get._source.emb.token_12: 2.0 } + - match: { items.0.update.get._source.emb.token_13: 1.0 } + - exists: items.0.update.get._source.nested + - length: { items.0.update.get._source.nested: 3} + - exists: items.0.update.get._source.nested.0.emb + - match: { items.0.update.get._source.nested.0.paragraph_id: 0 } + - length: { items.0.update.get._source.nested.0.emb: 3 } + - not_exists: items.0.update.get._source.nested.1.emb + - match: { items.0.update.get._source.nested.1.paragraph_id: 1 } + - exists: items.0.update.get._source.nested.2.emb + - length: { items.0.update.get._source.nested.2.emb: 1 } + - match: { items.0.update.get._source.nested.2.paragraph_id: 2 } + - set: { items.0.update.get._source.nested: original_nested } + + - do: + headers: + # Force JSON content type so that we use a parser that interprets the embeddings as doubles + Content-Type: application/json + get: + _source_exclude_vectors: false + index: test + id: "4" + + - match: { _source.name: zoolander2.jpg } + - length: { _source.emb: 2 } + - match: { _source.emb.token_12: 2.0 } + - match: { _source.emb.token_13: 1.0 } + - match: { _source.nested: $original_nested } + + - do: + indices.refresh: {} + + - do: + headers: + # Force JSON content type so that we use a parser that interprets the 
embeddings as doubles + Content-Type: application/json + search: + index: test + body: + _source: + "exclude_vectors": false + query: + term: + _id: 4 + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0._source.name: zoolander2.jpg } + - match: { hits.hits.0._source.nested: $original_nested } + +--- +"Partial update with synthetic vectors": + - do: + headers: + # Force JSON content type so that we use a parser that interprets the vectors as doubles + Content-Type: application/json + update: + index: test + id: "4" + body: + _source: true + doc: { + "name": "zoolander3.jpg", + "emb": { + "token_3": 2.0, + "token_9": 2.5 + } + } + + - length: { get._source.emb: 2 } + - match: { get._source.emb.token_3: 2.0 } + - match: { get._source.emb.token_9: 2.5 } + - exists: get._source.nested + - length: { get._source.nested: 3} + - exists: get._source.nested.0.emb + - match: { get._source.nested.0.paragraph_id: 0 } + - length: { get._source.nested.0.emb: 3 } + - not_exists: get._source.nested.1.emb + - match: { get._source.nested.1.paragraph_id: 1 } + - exists: get._source.nested.2.emb + - length: { get._source.nested.2.emb: 1 } + - match: { get._source.nested.2.paragraph_id: 2 } + - set: { get._source.nested: original_nested } + + - do: + headers: + # Force JSON content type so that we use a parser that interprets the vectors as doubles + Content-Type: application/json + get: + _source_exclude_vectors: false + index: test + id: "4" + + - length: { _source.emb: 2 } + - match: { _source.emb.token_3: 2.0 } + - match: { _source.emb.token_9: 2.5 } + - match: { _source.name: zoolander3.jpg } + - match: { _source.nested: $original_nested } + + - do: + indices.refresh: {} + + - do: + headers: + # Force JSON content type so that we use a parser that interprets the vectors as doubles + Content-Type: application/json + search: + index: test + body: + _source: + "exclude_vectors": false + query: + term: + _id: 4 + + - match: { 
hits.total.value: 1 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0._source.name: zoolander3.jpg } + - match: { hits.hits.0._source.nested: $original_nested } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java index ef21355d3356b..4d1c4fc41526c 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java @@ -25,7 +25,6 @@ import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -3028,9 +3027,11 @@ public String toString() { @Override public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() { - return isSyntheticVector - ? 
new SyntheticDenseVectorPatchLoader(new IndexedSyntheticFieldLoader(indexCreatedVersion, fieldType().similarity)) - : null; + if (isSyntheticVector) { + var syntheticField = new IndexedSyntheticFieldLoader(indexCreatedVersion, fieldType().similarity); + return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyVectorAsList); + } + return null; } @Override @@ -3127,7 +3128,7 @@ public void write(XContentBuilder b) throws IOException { * * @throws IOException if reading fails */ - private Object copyVectorAsList() throws IOException { + private List copyVectorAsList() throws IOException { assert hasValue : "vector is null for ord=" + ord; if (floatValues != null) { float[] raw = floatValues.vectorValue(ord); @@ -3218,29 +3219,6 @@ public String fieldName() { } } - public class SyntheticDenseVectorPatchLoader implements SourceLoader.SyntheticVectorsLoader { - private final IndexedSyntheticFieldLoader syntheticFieldLoader; - - public SyntheticDenseVectorPatchLoader(IndexedSyntheticFieldLoader syntheticFieldLoader) { - this.syntheticFieldLoader = syntheticFieldLoader; - } - - public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException { - var dvLoader = syntheticFieldLoader.docValuesLoader(context.reader(), null); - return (doc, acc) -> { - if (dvLoader == null) { - return; - } - if (dvLoader.advanceToDoc(doc) && syntheticFieldLoader.hasValue()) { - // add vectors as list since that's how they're parsed from xcontent. 
- acc.add( - new SourceLoader.LeafSyntheticVectorPath(syntheticFieldLoader.fieldName(), syntheticFieldLoader.copyVectorAsList()) - ); - } - }; - } - } - /** * Interface for a function that takes a int and boolean */ diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index bb212c81f7477..8eacc68b45c88 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -63,6 +63,7 @@ import java.util.Objects; import java.util.stream.Stream; +import static org.elasticsearch.index.IndexSettings.INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING; import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST; import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg; @@ -107,9 +108,12 @@ public static class Builder extends FieldMapper.Builder { Objects::toString ).acceptsNull().setSerializerCheck(this::indexOptionsSerializerCheck); - public Builder(String name, IndexVersion indexVersionCreated) { + private boolean isSyntheticVector; + + public Builder(String name, IndexVersion indexVersionCreated, boolean isSyntheticVector) { super(name); this.indexVersionCreated = indexVersionCreated; + this.isSyntheticVector = isSyntheticVector; } public Builder setStored(boolean value) { @@ -129,16 +133,19 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) { builderIndexOptions = getDefaultIndexOptions(indexVersionCreated); } + final boolean syntheticVectorFinal = context.isSourceSynthetic() == false && isSyntheticVector; + final boolean storedFinal = stored.getValue() || syntheticVectorFinal; return new SparseVectorFieldMapper( leafName(), new SparseVectorFieldType( indexVersionCreated, context.buildFullName(leafName()), - stored.getValue(), + 
storedFinal, meta.getValue(), builderIndexOptions ), - builderParams(this, context) + builderParams(this, context), + syntheticVectorFinal ); } @@ -196,7 +203,11 @@ private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingPar throw new IllegalArgumentException(ERROR_MESSAGE_8X); } - return new Builder(n, c.indexVersionCreated()); + return new Builder( + n, + c.indexVersionCreated(), + INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING.get(c.getIndexSettings().getSettings()) + ); }, notInMultiFields(CONTENT_TYPE)); public static final class SparseVectorFieldType extends MappedFieldType { @@ -302,8 +313,16 @@ private static String indexedValueForSearch(Object value) { } } - private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, BuilderParams builderParams) { + private final boolean isSyntheticVector; + + private SparseVectorFieldMapper( + String simpleName, + MappedFieldType mappedFieldType, + BuilderParams builderParams, + boolean isSyntheticVector + ) { super(simpleName, mappedFieldType, builderParams); + this.isSyntheticVector = isSyntheticVector; } @Override @@ -314,6 +333,15 @@ protected SyntheticSourceSupport syntheticSourceSupport() { return super.syntheticSourceSupport(); } + @Override + public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() { + if (isSyntheticVector) { + var syntheticField = new SparseVectorSyntheticFieldLoader(fullPath(), leafName()); + return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyAsMap); + } + return null; + } + @Override public Map indexAnalyzers() { return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER); @@ -321,7 +349,7 @@ public Map indexAnalyzers() { @Override public FieldMapper.Builder getMergeBuilder() { - return new Builder(leafName(), this.fieldType().indexVersionCreated).init(this); + return new Builder(leafName(), this.fieldType().indexVersionCreated, this.isSyntheticVector).init(this); } @Override @@ -504,9 +532,26 @@ public 
void write(XContentBuilder b) throws IOException { b.endObject(); } + /** + * Returns a deep-copied tokens map for the current document. + * + * @throws IOException if reading fails + */ + private Map copyAsMap() throws IOException { + assert termsDocEnum != null; + Map tokenMap = new LinkedHashMap<>(); + PostingsEnum reuse = null; + do { + reuse = termsDocEnum.postings(reuse); + reuse.nextDoc(); + tokenMap.put(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq())); + } while (termsDocEnum.next() != null); + return tokenMap; + } + @Override public String fieldName() { - return leafName; + return fullPath; } @Override diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SyntheticVectorsPatchFieldLoader.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SyntheticVectorsPatchFieldLoader.java new file mode 100644 index 0000000000000..53ce2335541a6 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SyntheticVectorsPatchFieldLoader.java @@ -0,0 +1,41 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.mapper.vectors; + +import org.apache.lucene.index.LeafReaderContext; +import org.elasticsearch.core.CheckedSupplier; +import org.elasticsearch.index.mapper.SourceLoader; + +import java.io.IOException; + +public class SyntheticVectorsPatchFieldLoader implements SourceLoader.SyntheticVectorsLoader { + private final SourceLoader.SyntheticFieldLoader syntheticLoader; + private final CheckedSupplier copyObject; + + public SyntheticVectorsPatchFieldLoader( + SourceLoader.SyntheticFieldLoader syntheticLoader, + CheckedSupplier copyObject + ) { + this.syntheticLoader = syntheticLoader; + this.copyObject = copyObject; + } + + public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException { + var dvLoader = syntheticLoader.docValuesLoader(context.reader(), null); + return (doc, acc) -> { + if (dvLoader == null) { + return; + } + if (dvLoader.advanceToDoc(doc) && syntheticLoader.hasValue()) { + acc.add(new SourceLoader.LeafSyntheticVectorPath(syntheticLoader.fieldName(), copyObject.get())); + } + }; + } +} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 2099a4b138264..dc21e5f8f57cc 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -26,6 +26,7 @@ import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.core.CheckedConsumer; import org.elasticsearch.core.Nullable; +import org.elasticsearch.core.Tuple; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.mapper.DocumentMapper; @@ -33,7 +34,6 @@ import org.elasticsearch.index.mapper.MappedFieldType; import 
org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.index.mapper.MapperService; -import org.elasticsearch.index.mapper.MapperTestCase; import org.elasticsearch.index.mapper.ParsedDocument; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.inference.WeightedToken; @@ -54,6 +54,7 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.TreeMap; import static org.elasticsearch.index.IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; import static org.elasticsearch.index.IndexVersions.UPGRADE_TO_LUCENE_10_0_0; @@ -66,14 +67,13 @@ import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; -public class SparseVectorFieldMapperTests extends MapperTestCase { +public class SparseVectorFieldMapperTests extends SyntheticVectorsMapperTestCase { @Override protected Object getSampleValueForDocument() { - Map map = new LinkedHashMap<>(); - map.put("ten", 10f); - map.put("twenty", 20f); - return map; + return new TreeMap<>( + randomMap(1, 5, () -> Tuple.tuple(randomAlphaOfLengthBetween(5, 10), Float.valueOf(randomIntBetween(1, 127)))) + ); } @Override @@ -209,26 +209,22 @@ public void testDefaults() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping)); assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString()); - ParsedDocument doc1 = mapper.parse(source(this::writeField)); + @SuppressWarnings("unchecked") + var expected = (Map) getSampleValueForDocument(); + ParsedDocument doc1 = mapper.parse(source(b -> b.field("field", expected))); List fields = doc1.rootDoc().getFields("field"); - assertEquals(2, fields.size()); + assertEquals(expected.size(), fields.size()); assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class)); - FeatureField featureField1 = null; - FeatureField featureField2 = null; + for (IndexableField field : fields) { - if 
(field.stringValue().equals("ten")) { - featureField1 = (FeatureField) field; - } else if (field.stringValue().equals("twenty")) { - featureField2 = (FeatureField) field; - } else { - throw new UnsupportedOperationException(); + if (field instanceof FeatureField fField) { + var value = expected.remove(fField.stringValue()); + assertThat(fField.getFeatureValue(), equalTo(value)); + int freq1 = getFrequency(fField.tokenStream(null, null)); + assertThat(XFeatureField.decodeFeatureValue(freq1), equalTo(value)); } } - - int freq1 = getFrequency(featureField1.tokenStream(null, null)); - int freq2 = getFrequency(featureField2.tokenStream(null, null)); - assertTrue(freq1 < freq2); } public void testDefaultsWithAndWithoutIncludeDefaults() throws Exception { @@ -460,7 +456,8 @@ public boolean preservesExactSource() { @Override public SyntheticSourceExample example(int maxValues) { - return new SyntheticSourceExample(getSampleValueForDocument(), getSampleValueForDocument(), b -> { + var sample = getSampleValueForDocument(); + return new SyntheticSourceExample(sample, sample, b -> { if (withStore) { minimalStoreMapping(b); } else { diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java index e16b41a28e274..9972fa9e5ae0b 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java @@ -1169,7 +1169,7 @@ private static Mapper.Builder createEmbeddingsField( boolean useLegacyFormat ) { return switch (modelSettings.taskType()) { - case SPARSE_EMBEDDING -> new SparseVectorFieldMapper.Builder(CHUNKED_EMBEDDINGS_FIELD, indexVersionCreated).setStored( + case SPARSE_EMBEDDING -> new SparseVectorFieldMapper.Builder(CHUNKED_EMBEDDINGS_FIELD, 
indexVersionCreated, false).setStored( useLegacyFormat == false ); case TEXT_EMBEDDING -> { diff --git a/x-pack/plugin/rank-vectors/src/main/java/module-info.java b/x-pack/plugin/rank-vectors/src/main/java/module-info.java index 4af3c994edd38..476ad2c3c0f4c 100644 --- a/x-pack/plugin/rank-vectors/src/main/java/module-info.java +++ b/x-pack/plugin/rank-vectors/src/main/java/module-info.java @@ -6,6 +6,7 @@ */ module org.elasticsearch.rank.vectors { + requires org.elasticsearch.base; requires org.elasticsearch.xcore; requires org.elasticsearch.painless.spi; requires org.elasticsearch.server; diff --git a/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapper.java b/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapper.java index 4dd8cefc0115c..a79fb4f304f6a 100644 --- a/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapper.java +++ b/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapper.java @@ -10,7 +10,6 @@ import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.util.BytesRef; @@ -30,6 +29,7 @@ import org.elasticsearch.index.mapper.TextSearchInfo; import org.elasticsearch.index.mapper.ValueFetcher; import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper; +import org.elasticsearch.index.mapper.vectors.SyntheticVectorsPatchFieldLoader; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.license.LicenseUtils; import org.elasticsearch.license.XPackLicenseState; @@ -406,7 +406,11 @@ protected SyntheticSourceSupport syntheticSourceSupport() { @Override public 
SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() { - return isSyntheticVector ? new SyntheticRankVectorPatchLoader(new DocValuesSyntheticFieldLoader()) : null; + if (isSyntheticVector) { + var syntheticField = new DocValuesSyntheticFieldLoader(); + return new SyntheticVectorsPatchFieldLoader(syntheticField, syntheticField::copyVectorsAsList); + } + return null; } private class DocValuesSyntheticFieldLoader extends SourceLoader.DocValuesBasedSyntheticFieldLoader { @@ -455,7 +459,12 @@ public void write(XContentBuilder b) throws IOException { b.endArray(); } - private Object copyVectorsAsList() throws IOException { + /** + * Returns deep-copied vectors for the current document as a list. + * + * @throws IOException if reading fails + */ + private List> copyVectorsAsList() throws IOException { assert hasValue : "rank vector is null"; BytesRef ref = values.binaryValue(); ByteBuffer byteBuffer = ByteBuffer.wrap(ref.bytes, ref.offset, ref.length).order(ByteOrder.LITTLE_ENDIAN); @@ -492,28 +501,4 @@ public String fieldName() { return fullPath(); } } - - private class SyntheticRankVectorPatchLoader implements SourceLoader.SyntheticVectorsLoader { - private final DocValuesSyntheticFieldLoader syntheticFieldLoader; - - private SyntheticRankVectorPatchLoader(DocValuesSyntheticFieldLoader syntheticFieldLoader) { - this.syntheticFieldLoader = syntheticFieldLoader; - } - - @Override - public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException { - var dvLoader = syntheticFieldLoader.docValuesLoader(context.reader(), null); - return (doc, acc) -> { - if (dvLoader == null) { - return; - } - if (dvLoader.advanceToDoc(doc) && syntheticFieldLoader.hasValue()) { - // add vectors as list since that's how they're parsed from xcontent. 
- acc.add( - new SourceLoader.LeafSyntheticVectorPath(syntheticFieldLoader.fieldName(), syntheticFieldLoader.copyVectorsAsList()) - ); - } - }; - } - } } From b617a8cbecd037d5950db62f0c16dca6c06149bb Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Mon, 7 Jul 2025 18:37:58 +0100 Subject: [PATCH 2/3] Update docs/changelog/130756.yaml --- docs/changelog/130756.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/130756.yaml diff --git a/docs/changelog/130756.yaml b/docs/changelog/130756.yaml new file mode 100644 index 0000000000000..d5d4d3821b115 --- /dev/null +++ b/docs/changelog/130756.yaml @@ -0,0 +1,5 @@ +pr: 130756 +summary: Add synthetic vectors support for `sparse_vector` +area: Vector Search +type: enhancement +issues: [] From 8353ebdd5eb1426600936433f7126a1abad7223c Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Mon, 7 Jul 2025 18:38:44 +0100 Subject: [PATCH 3/3] Delete docs/changelog/130756.yaml Remove changelog, this is behind a feature flag for now. --- docs/changelog/130756.yaml | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 docs/changelog/130756.yaml diff --git a/docs/changelog/130756.yaml b/docs/changelog/130756.yaml deleted file mode 100644 index d5d4d3821b115..0000000000000 --- a/docs/changelog/130756.yaml +++ /dev/null @@ -1,5 +0,0 @@ -pr: 130756 -summary: Add synthetic vectors support for `sparse_vector` -area: Vector Search -type: enhancement -issues: []