diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java index db6592e60f0af..ef21355d3356b 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java @@ -2853,10 +2853,6 @@ private DenseVectorFieldMapper( this.isSyntheticVector = isSyntheticVector; } - public boolean isSyntheticVector() { - return isSyntheticVector; - } - @Override public DenseVectorFieldType fieldType() { return (DenseVectorFieldType) super.fieldType(); @@ -3032,7 +3028,7 @@ public String toString() { @Override public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() { - return isSyntheticVector() + return isSyntheticVector ? new SyntheticDenseVectorPatchLoader(new IndexedSyntheticFieldLoader(indexCreatedVersion, fieldType().similarity)) : null; } @@ -3131,7 +3127,7 @@ public void write(XContentBuilder b) throws IOException { * * @throws IOException if reading fails */ - public Object copyVectorAsList() throws IOException { + private Object copyVectorAsList() throws IOException { assert hasValue : "vector is null for ord=" + ord; if (floatValues != null) { float[] raw = floatValues.vectorValue(ord); @@ -3235,8 +3231,7 @@ public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) if (dvLoader == null) { return; } - dvLoader.advanceToDoc(doc); - if (syntheticFieldLoader.hasValue()) { + if (dvLoader.advanceToDoc(doc) && syntheticFieldLoader.hasValue()) { // add vectors as list since that's how they're parsed from xcontent. 
acc.add( new SourceLoader.LeafSyntheticVectorPath(syntheticFieldLoader.fieldName(), syntheticFieldLoader.copyVectorAsList()) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java index 75b70f228321e..f79c14c831f86 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java @@ -11,28 +11,21 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.KnnByteVectorField; import org.apache.lucene.document.KnnFloatVectorField; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.VectorUtil; -import org.elasticsearch.common.Strings; import org.elasticsearch.common.bytes.BytesReference; -import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.xcontent.XContentHelper; -import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.codec.CodecService; @@ -46,7 +39,6 @@ import org.elasticsearch.index.mapper.MapperBuilderContext; import 
org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.index.mapper.MapperService; -import org.elasticsearch.index.mapper.MapperTestCase; import org.elasticsearch.index.mapper.ParsedDocument; import org.elasticsearch.index.mapper.SourceToParse; import org.elasticsearch.index.mapper.ValueFetcher; @@ -61,7 +53,6 @@ import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.index.IndexVersionUtils; import org.elasticsearch.xcontent.XContentBuilder; -import org.elasticsearch.xcontent.XContentType; import org.junit.AssumptionViolatedException; import java.io.IOException; @@ -74,18 +65,16 @@ import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN; import static org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase.randomNormalizedVector; -import static org.elasticsearch.index.IndexSettings.SYNTHETIC_VECTORS; import static org.elasticsearch.index.codec.vectors.IVFVectorsFormat.DYNAMIC_NPROBE; import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.DEFAULT_OVERSAMPLE; import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.IVF_FORMAT; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertToXContentEquivalent; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -public class DenseVectorFieldMapperTests extends MapperTestCase { +public class DenseVectorFieldMapperTests extends SyntheticVectorsMapperTestCase { private static final IndexVersion INDEXED_BY_DEFAULT_PREVIOUS_INDEX_VERSION = IndexVersions.V_8_10_0; private final ElementType elementType; @@ -95,7 +84,7 @@ public class DenseVectorFieldMapperTests extends MapperTestCase { public DenseVectorFieldMapperTests() { 
this.elementType = randomFrom(ElementType.BYTE, ElementType.FLOAT, ElementType.BIT); - this.indexed = randomBoolean(); + this.indexed = usually(); this.indexOptionsSet = this.indexed && randomBoolean(); int baseDims = ElementType.BIT == elementType ? 4 * Byte.SIZE : 4; int randomMultiplier = ElementType.FLOAT == elementType ? randomIntBetween(1, 64) : 1; @@ -160,10 +149,10 @@ private void indexMapping(XContentBuilder b, IndexVersion indexVersion) throws I protected Object getSampleValueForDocument() { return elementType == ElementType.FLOAT ? convertToList(randomNormalizedVector(this.dims)) - : List.of((byte) 1, (byte) 1, (byte) 1, (byte) 1); + : convertToList(randomByteArrayOfLength(elementType == ElementType.BIT ? this.dims / Byte.SIZE : dims)); } - private static List convertToList(float[] vector) { + public static List convertToList(float[] vector) { List list = new ArrayList<>(vector.length); for (float v : vector) { list.add(v); @@ -171,6 +160,14 @@ private static List convertToList(float[] vector) { return list; } + public static List convertToList(byte[] vector) { + List list = new ArrayList<>(vector.length); + for (byte v : vector) { + list.add(v); + } + return list; + } + @Override protected void registerParameters(ParameterChecker checker) throws IOException { checker.registerConflictCheck( @@ -2920,249 +2917,6 @@ public void testInvalidVectorDimensions() { } } - public void testSyntheticVectorsMinimalValidDocument() throws IOException { - assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); - for (XContentType type : XContentType.values()) { - BytesReference source = generateRandomDoc(type, true, true, false, false, false); - assertSyntheticVectors(buildVectorMapping(), source, type); - } - } - - public void testSyntheticVectorsFullDocument() throws IOException { - assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); - for (XContentType type : XContentType.values()) { - BytesReference 
source = generateRandomDoc(type, true, true, true, true, false); - assertSyntheticVectors(buildVectorMapping(), source, type); - } - } - - public void testSyntheticVectorsWithUnmappedFields() throws IOException { - assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); - for (XContentType type : XContentType.values()) { - BytesReference source = generateRandomDoc(type, true, true, true, true, true); - assertSyntheticVectors(buildVectorMapping(), source, type); - } - } - - public void testSyntheticVectorsMissingRootFields() throws IOException { - assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); - for (XContentType type : XContentType.values()) { - BytesReference source = generateRandomDoc(type, false, false, false, false, false); - assertSyntheticVectors(buildVectorMapping(), source, type); - } - } - - public void testSyntheticVectorsPartialNestedContent() throws IOException { - assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); - for (XContentType type : XContentType.values()) { - BytesReference source = generateRandomDoc(type, true, true, true, false, false); - assertSyntheticVectors(buildVectorMapping(), source, type); - } - } - - public void testFlatPathDocument() throws IOException { - assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); - for (XContentType type : XContentType.values()) { - BytesReference source = generateRandomDocWithFlatPath(type); - assertSyntheticVectors(buildVectorMapping(), source, type); - } - } - - private static String buildVectorMapping() throws IOException { - try (XContentBuilder builder = XContentBuilder.builder(XContentType.JSON.xContent())) { - builder.startObject(); // root - builder.startObject("_doc"); - builder.field("dynamic", "false"); - - builder.startObject("properties"); - - // field - builder.startObject("field"); - builder.field("type", "keyword"); - builder.endObject(); - - // emb - 
builder.startObject("emb"); - builder.field("type", "dense_vector"); - builder.field("dims", 3); - builder.field("similarity", "cosine"); - builder.endObject(); - - // another_field - builder.startObject("another_field"); - builder.field("type", "keyword"); - builder.endObject(); - - // obj - builder.startObject("obj"); - builder.startObject("properties"); - - // nested - builder.startObject("nested"); - builder.field("type", "nested"); - builder.startObject("properties"); - - // nested.field - builder.startObject("field"); - builder.field("type", "keyword"); - builder.endObject(); - - // nested.emb - builder.startObject("emb"); - builder.field("type", "dense_vector"); - builder.field("dims", 3); - builder.field("similarity", "cosine"); - builder.endObject(); - - // double_nested - builder.startObject("double_nested"); - builder.field("type", "nested"); - builder.startObject("properties"); - - // double_nested.field - builder.startObject("field"); - builder.field("type", "keyword"); - builder.endObject(); - - // double_nested.emb - builder.startObject("emb"); - builder.field("type", "dense_vector"); - builder.field("dims", 3); - builder.field("similarity", "cosine"); - builder.endObject(); - - builder.endObject(); // double_nested.properties - builder.endObject(); // double_nested - - builder.endObject(); // nested.properties - builder.endObject(); // nested - - builder.endObject(); // obj.properties - builder.endObject(); // obj - - builder.endObject(); // properties - builder.endObject(); // _doc - builder.endObject(); // root - - return Strings.toString(builder); - } - } - - private BytesReference generateRandomDoc( - XContentType xContentType, - boolean includeRootField, - boolean includeVector, - boolean includeNested, - boolean includeDoubleNested, - boolean includeUnmapped - ) throws IOException { - try (var builder = XContentBuilder.builder(xContentType.xContent())) { - builder.startObject(); - - if (includeRootField) { - builder.field("field", 
randomAlphaOfLengthBetween(1, 2)); - } - - if (includeVector) { - builder.array("emb", new float[] { 1, 2, 3 }); - } - - if (includeUnmapped) { - builder.field("unmapped_field", "extra"); - } - - builder.startObject("obj"); - if (includeNested) { - builder.startArray("nested"); - - // Entry with just a field - builder.startObject(); - builder.field("field", randomAlphaOfLengthBetween(3, 6)); - builder.endObject(); - - // Empty object - builder.startObject(); - builder.endObject(); - - // Entry with emb and double_nested - if (includeDoubleNested) { - builder.startObject(); - builder.array("emb", new float[] { 1, 2, 3 }); - builder.field("field", "nested_val"); - builder.startArray("double_nested"); - for (int i = 0; i < 2; i++) { - builder.startObject(); - builder.array("emb", new float[] { 1, 2, 3 }); - builder.field("field", "dn_field"); - builder.endObject(); - } - builder.endArray(); - builder.endObject(); - } - - builder.endArray(); - } - builder.endObject(); - - builder.endObject(); - return BytesReference.bytes(builder); - } - } - - private BytesReference generateRandomDocWithFlatPath(XContentType xContentType) throws IOException { - try (var builder = XContentBuilder.builder(xContentType.xContent())) { - builder.startObject(); - - // Root-level fields - builder.field("field", randomAlphaOfLengthBetween(1, 2)); - builder.array("emb", new float[] { 1, 2, 3 }); - builder.field("another_field", randomAlphaOfLengthBetween(3, 5)); - - // Simulated flattened "obj.nested" - builder.startObject("obj.nested"); - - builder.field("field", randomAlphaOfLengthBetween(4, 8)); - builder.array("emb", new float[] { 1, 2, 3 }); - - builder.startArray("double_nested"); - for (int i = 0; i < randomIntBetween(1, 2); i++) { - builder.startObject(); - builder.field("field", randomAlphaOfLengthBetween(4, 8)); - builder.array("emb", new float[] { 1, 2, 3 }); - builder.endObject(); - } - builder.endArray(); - - builder.endObject(); // end obj.nested - - builder.endObject(); - return 
BytesReference.bytes(builder); - } - } - - private void assertSyntheticVectors(String mapping, BytesReference source, XContentType xContentType) throws IOException { - var settings = Settings.builder().put(IndexSettings.INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING.getKey(), true).build(); - MapperService mapperService = createMapperService(settings, mapping); - var parsedDoc = mapperService.documentMapper().parse(new SourceToParse("0", source, xContentType)); - try (var directory = newDirectory()) { - IndexWriterConfig config = newIndexWriterConfig(random(), new StandardAnalyzer()); - try (var iw = new RandomIndexWriter(random(), directory, config)) { - parsedDoc.updateSeqID(0, 1); - parsedDoc.version().setLongValue(0); - iw.addDocuments(parsedDoc.docs()); - } - try (var indexReader = wrapInMockESDirectoryReader(DirectoryReader.open(directory))) { - var provider = SourceProvider.fromLookup( - mapperService.mappingLookup(), - null, - mapperService.getMapperMetrics().sourceFieldMetrics() - ); - var searchSource = provider.getSource(indexReader.leaves().get(0), parsedDoc.docs().size() - 1); - assertToXContentEquivalent(source, searchSource.internalSourceRef(), xContentType); - } - } - } - @Override protected IngestScriptSupport ingestScriptSupport() { throw new AssumptionViolatedException("not supported"); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SyntheticVectorsMapperTestCase.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SyntheticVectorsMapperTestCase.java new file mode 100644 index 0000000000000..ebb4fe788fea3 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SyntheticVectorsMapperTestCase.java @@ -0,0 +1,272 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.mapper.vectors; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.mapper.MapperTestCase; +import org.elasticsearch.index.mapper.SourceToParse; +import org.elasticsearch.search.lookup.SourceProvider; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentType; + +import java.io.IOException; + +import static org.elasticsearch.index.IndexSettings.SYNTHETIC_VECTORS; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertToXContentEquivalent; + +public abstract class SyntheticVectorsMapperTestCase extends MapperTestCase { + public void testSyntheticVectorsMinimalValidDocument() throws IOException { + assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); + for (XContentType type : XContentType.values()) { + BytesReference source = generateRandomDoc(type, true, true, false, false, false); + assertSyntheticVectors(buildVectorMapping(), source, type); + } + } + + public void testSyntheticVectorsFullDocument() throws IOException { + assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); + for (XContentType type : XContentType.values()) 
{ + BytesReference source = generateRandomDoc(type, true, true, true, true, false); + assertSyntheticVectors(buildVectorMapping(), source, type); + } + } + + public void testSyntheticVectorsWithUnmappedFields() throws IOException { + assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); + for (XContentType type : XContentType.values()) { + BytesReference source = generateRandomDoc(type, true, true, true, true, true); + assertSyntheticVectors(buildVectorMapping(), source, type); + } + } + + public void testSyntheticVectorsMissingRootFields() throws IOException { + assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); + for (XContentType type : XContentType.values()) { + BytesReference source = generateRandomDoc(type, false, false, false, false, false); + assertSyntheticVectors(buildVectorMapping(), source, type); + } + } + + public void testSyntheticVectorsPartialNestedContent() throws IOException { + assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); + for (XContentType type : XContentType.values()) { + BytesReference source = generateRandomDoc(type, true, true, true, false, false); + assertSyntheticVectors(buildVectorMapping(), source, type); + } + } + + public void testFlatPathDocument() throws IOException { + assumeTrue("feature flag must be enabled for synthetic vectors", SYNTHETIC_VECTORS); + for (XContentType type : XContentType.values()) { + BytesReference source = generateRandomDocWithFlatPath(type); + assertSyntheticVectors(buildVectorMapping(), source, type); + } + } + + private String buildVectorMapping() throws IOException { + try (XContentBuilder builder = XContentBuilder.builder(XContentType.JSON.xContent())) { + builder.startObject(); // root + builder.startObject("_doc"); + builder.field("dynamic", "false"); + + builder.startObject("properties"); + + // field + builder.startObject("field"); + builder.field("type", "keyword"); + builder.endObject(); + + // 
emb + builder.startObject("emb"); + minimalMapping(builder); + builder.endObject(); + + // another_field + builder.startObject("another_field"); + builder.field("type", "keyword"); + builder.endObject(); + + // obj + builder.startObject("obj"); + builder.startObject("properties"); + + // nested + builder.startObject("nested"); + builder.field("type", "nested"); + builder.startObject("properties"); + + // nested.field + builder.startObject("field"); + builder.field("type", "keyword"); + builder.endObject(); + + // nested.emb + builder.startObject("emb"); + minimalMapping(builder); + builder.endObject(); + + // double_nested + builder.startObject("double_nested"); + builder.field("type", "nested"); + builder.startObject("properties"); + + // double_nested.field + builder.startObject("field"); + builder.field("type", "keyword"); + builder.endObject(); + + // double_nested.emb + builder.startObject("emb"); + minimalMapping(builder); + builder.endObject(); + + builder.endObject(); // double_nested.properties + builder.endObject(); // double_nested + + builder.endObject(); // nested.properties + builder.endObject(); // nested + + builder.endObject(); // obj.properties + builder.endObject(); // obj + + builder.endObject(); // properties + builder.endObject(); // _doc + builder.endObject(); // root + + return Strings.toString(builder); + } + } + + private BytesReference generateRandomDoc( + XContentType xContentType, + boolean includeRootField, + boolean includeVector, + boolean includeNested, + boolean includeDoubleNested, + boolean includeUnmapped + ) throws IOException { + try (var builder = XContentBuilder.builder(xContentType.xContent())) { + builder.startObject(); + + if (includeRootField) { + builder.field("field", randomAlphaOfLengthBetween(1, 2)); + } + + if (includeVector) { + builder.field("emb", getSampleValueForDocument()); + // builder.array("emb", new float[] { 1, 2, 3 }); + } + + if (includeUnmapped) { + builder.field("unmapped_field", "extra"); + } + + 
builder.startObject("obj"); + if (includeNested) { + builder.startArray("nested"); + + // Entry with just a field + builder.startObject(); + builder.field("field", randomAlphaOfLengthBetween(3, 6)); + builder.endObject(); + + // Empty object + builder.startObject(); + builder.endObject(); + + // Entry with emb and double_nested + if (includeDoubleNested) { + builder.startObject(); + // builder.array("emb", new float[] { 1, 2, 3 }); + builder.field("emb", getSampleValueForDocument()); + builder.field("field", "nested_val"); + builder.startArray("double_nested"); + for (int i = 0; i < 2; i++) { + builder.startObject(); + // builder.array("emb", new float[] { 1, 2, 3 }); + builder.field("emb", getSampleValueForDocument()); + builder.field("field", "dn_field"); + builder.endObject(); + } + builder.endArray(); + builder.endObject(); + } + + builder.endArray(); + } + builder.endObject(); + + builder.endObject(); + return BytesReference.bytes(builder); + } + } + + private BytesReference generateRandomDocWithFlatPath(XContentType xContentType) throws IOException { + try (var builder = XContentBuilder.builder(xContentType.xContent())) { + builder.startObject(); + + // Root-level fields + builder.field("field", randomAlphaOfLengthBetween(1, 2)); + builder.field("emb", getSampleValueForDocument()); + builder.field("another_field", randomAlphaOfLengthBetween(3, 5)); + + // Simulated flattened "obj.nested" + builder.startObject("obj.nested"); + + builder.field("field", randomAlphaOfLengthBetween(4, 8)); + builder.field("emb", getSampleValueForDocument()); + + builder.startArray("double_nested"); + for (int i = 0; i < randomIntBetween(1, 2); i++) { + builder.startObject(); + builder.field("field", randomAlphaOfLengthBetween(4, 8)); + builder.field("emb", getSampleValueForDocument()); + builder.endObject(); + } + builder.endArray(); + + builder.endObject(); // end obj.nested + + builder.endObject(); + return BytesReference.bytes(builder); + } + } + + private void 
assertSyntheticVectors(String mapping, BytesReference source, XContentType xContentType) throws IOException { + var settings = Settings.builder().put(IndexSettings.INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING.getKey(), true).build(); + MapperService mapperService = createMapperService(settings, mapping); + var parsedDoc = mapperService.documentMapper().parse(new SourceToParse("0", source, xContentType)); + try (var directory = newDirectory()) { + IndexWriterConfig config = newIndexWriterConfig(random(), new StandardAnalyzer()); + try (var iw = new RandomIndexWriter(random(), directory, config)) { + parsedDoc.updateSeqID(0, 1); + parsedDoc.version().setLongValue(0); + iw.addDocuments(parsedDoc.docs()); + } + try (var indexReader = wrapInMockESDirectoryReader(DirectoryReader.open(directory))) { + var provider = SourceProvider.fromLookup( + mapperService.mappingLookup(), + null, + mapperService.getMapperMetrics().sourceFieldMetrics() + ); + var searchSource = provider.getSource(indexReader.leaves().get(0), parsedDoc.docs().size() - 1); + assertToXContentEquivalent(source, searchSource.internalSourceRef(), xContentType); + } + } + } +} diff --git a/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/RankVectorsPlugin.java b/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/RankVectorsPlugin.java index 35c87f1fc1847..dd38367125692 100644 --- a/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/RankVectorsPlugin.java +++ b/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/RankVectorsPlugin.java @@ -20,6 +20,7 @@ import java.util.Map; +import static org.elasticsearch.index.IndexSettings.INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING; import static org.elasticsearch.index.mapper.FieldMapper.notInMultiFields; import static org.elasticsearch.xpack.rank.vectors.mapper.RankVectorsFieldMapper.CONTENT_TYPE; @@ -36,7 +37,12 @@ public Map getMappers() { if 
(RANK_VECTORS_FEATURE.check(getLicenseState()) == false) { throw LicenseUtils.newComplianceException("Rank Vectors"); } - return new RankVectorsFieldMapper.Builder(n, c.indexVersionCreated(), getLicenseState()); + return new RankVectorsFieldMapper.Builder( + n, + c.indexVersionCreated(), + getLicenseState(), + INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING.get(c.getIndexSettings().getSettings()) + ); }, notInMultiFields(CONTENT_TYPE))); } diff --git a/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapper.java b/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapper.java index 7c536640f1f95..4dd8cefc0115c 100644 --- a/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapper.java +++ b/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapper.java @@ -10,6 +10,7 @@ import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.util.BytesRef; @@ -112,11 +113,18 @@ public static class Builder extends FieldMapper.Builder { private final IndexVersion indexCreatedVersion; private final XPackLicenseState licenseState; + private final boolean isSyntheticVector; - public Builder(String name, IndexVersion indexCreatedVersion, XPackLicenseState licenseState) { + public Builder(String name, IndexVersion indexCreatedVersion, XPackLicenseState licenseState, boolean isSyntheticVector) { super(name); this.indexCreatedVersion = indexCreatedVersion; this.licenseState = licenseState; + this.isSyntheticVector = isSyntheticVector; + } + + public Builder dimensions(int dimensions) { + this.dims.setValue(dimensions); + return this; } 
@Override @@ -133,6 +141,7 @@ public RankVectorsFieldMapper build(MapperBuilderContext context) { // Validate again here because the dimensions or element type could have been set programmatically, // which affects index option validity validate(); + boolean isSyntheticVectorFinal = context.isSourceSynthetic() == false && isSyntheticVector; return new RankVectorsFieldMapper( leafName(), new RankVectorsFieldType( @@ -144,7 +153,8 @@ public RankVectorsFieldMapper build(MapperBuilderContext context) { ), builderParams(this, context), indexCreatedVersion, - licenseState + licenseState, + isSyntheticVectorFinal ); } } @@ -242,17 +252,20 @@ DenseVectorFieldMapper.ElementType getElementType() { private final IndexVersion indexCreatedVersion; private final XPackLicenseState licenseState; + private final boolean isSyntheticVector; private RankVectorsFieldMapper( String simpleName, MappedFieldType fieldType, BuilderParams params, IndexVersion indexCreatedVersion, - XPackLicenseState licenseState + XPackLicenseState licenseState, + boolean isSyntheticVector ) { super(simpleName, fieldType, params); this.indexCreatedVersion = indexCreatedVersion; this.licenseState = licenseState; + this.isSyntheticVector = isSyntheticVector; } @Override @@ -299,14 +312,9 @@ public void parse(DocumentParserContext context) throws IOException { ); } } - RankVectorsFieldType updatedFieldType = new RankVectorsFieldType( - fieldType().name(), - fieldType().elementType, - currentDims, - licenseState, - fieldType().meta() - ); - Mapper update = new RankVectorsFieldMapper(leafName(), updatedFieldType, builderParams, indexCreatedVersion, licenseState); + var builder = (Builder) getMergeBuilder(); + builder.dimensions(currentDims); + Mapper update = builder.build(context.createDynamicMapperBuilderContext()); context.addDynamicMapper(update); return; } @@ -388,7 +396,7 @@ protected String contentType() { @Override public FieldMapper.Builder getMergeBuilder() { - return new Builder(leafName(), 
indexCreatedVersion, licenseState).init(this); + return new Builder(leafName(), indexCreatedVersion, licenseState, isSyntheticVector).init(this); } @Override @@ -396,6 +404,11 @@ protected SyntheticSourceSupport syntheticSourceSupport() { return new SyntheticSourceSupport.Native(DocValuesSyntheticFieldLoader::new); } + @Override + public SourceLoader.SyntheticVectorsLoader syntheticVectorsLoader() { + return isSyntheticVector ? new SyntheticRankVectorPatchLoader(new DocValuesSyntheticFieldLoader()) : null; + } + private class DocValuesSyntheticFieldLoader extends SourceLoader.DocValuesBasedSyntheticFieldLoader { private BinaryDocValues values; private boolean hasValue; @@ -407,8 +420,10 @@ public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf return null; } return docId -> { - hasValue = docId == values.advance(docId); - return hasValue; + if (values.docID() > docId) { + return hasValue = false; + } + return hasValue = values.docID() == docId || values.advance(docId) == docId; }; } @@ -440,9 +455,65 @@ public void write(XContentBuilder b) throws IOException { b.endArray(); } + private Object copyVectorsAsList() throws IOException { + assert hasValue : "rank vector is null"; + BytesRef ref = values.binaryValue(); + ByteBuffer byteBuffer = ByteBuffer.wrap(ref.bytes, ref.offset, ref.length).order(ByteOrder.LITTLE_ENDIAN); + assert ref.length % fieldType().elementType.getNumBytes(fieldType().dims) == 0; + int numVecs = ref.length / fieldType().elementType.getNumBytes(fieldType().dims); + List<List<?>> vectors = new ArrayList<>(numVecs); + for (int i = 0; i < numVecs; i++) { + int dims = fieldType().elementType == DenseVectorFieldMapper.ElementType.BIT + ?
fieldType().dims / Byte.SIZE + : fieldType().dims; + + switch (fieldType().elementType) { + case FLOAT -> { + List<Float> vec = new ArrayList<>(dims); + for (int dim = 0; dim < dims; dim++) { + vec.add(byteBuffer.getFloat()); + } + vectors.add(vec); + } + case BYTE, BIT -> { + List<Byte> vec = new ArrayList<>(dims); + for (int dim = 0; dim < dims; dim++) { + vec.add(byteBuffer.get()); + } + vectors.add(vec); + } + } + } + return vectors; + } + + @Override + public String fieldName() { + return fullPath(); + } + } + + private class SyntheticRankVectorPatchLoader implements SourceLoader.SyntheticVectorsLoader { + private final DocValuesSyntheticFieldLoader syntheticFieldLoader; + + private SyntheticRankVectorPatchLoader(DocValuesSyntheticFieldLoader syntheticFieldLoader) { + this.syntheticFieldLoader = syntheticFieldLoader; + } + + @Override + public SourceLoader.SyntheticVectorsLoader.Leaf leaf(LeafReaderContext context) throws IOException { + var dvLoader = syntheticFieldLoader.docValuesLoader(context.reader(), null); + return (doc, acc) -> { + if (dvLoader == null) { + return; + } + if (dvLoader.advanceToDoc(doc) && syntheticFieldLoader.hasValue()) { + // add vectors as list since that's how they're parsed from xcontent.
+ acc.add( + new SourceLoader.LeafSyntheticVectorPath(syntheticFieldLoader.fieldName(), syntheticFieldLoader.copyVectorsAsList()) + ); + } + }; + } + } } diff --git a/x-pack/plugin/rank-vectors/src/test/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapperTests.java b/x-pack/plugin/rank-vectors/src/test/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapperTests.java index 69f95ae4bf52f..6f80d7b46f563 100644 --- a/x-pack/plugin/rank-vectors/src/test/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapperTests.java +++ b/x-pack/plugin/rank-vectors/src/test/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapperTests.java @@ -21,11 +21,11 @@ import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.index.mapper.MapperService; -import org.elasticsearch.index.mapper.MapperTestCase; import org.elasticsearch.index.mapper.ParsedDocument; import org.elasticsearch.index.mapper.SourceToParse; import org.elasticsearch.index.mapper.ValueFetcher; import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.ElementType; +import org.elasticsearch.index.mapper.vectors.SyntheticVectorsMapperTestCase; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.search.lookup.Source; @@ -46,20 +46,24 @@ import java.util.Set; import java.util.stream.Stream; +import static org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase.randomNormalizedVector; +import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapperTests.convertToList; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -public class RankVectorsFieldMapperTests extends MapperTestCase { +public class 
RankVectorsFieldMapperTests extends SyntheticVectorsMapperTestCase { private final ElementType elementType; private final int dims; public RankVectorsFieldMapperTests() { this.elementType = randomFrom(ElementType.BYTE, ElementType.FLOAT, ElementType.BIT); - this.dims = ElementType.BIT == elementType ? 4 * Byte.SIZE : 4; + int baseDims = ElementType.BIT == elementType ? 4 * Byte.SIZE : 4; + int randomMultiplier = ElementType.FLOAT == elementType ? randomIntBetween(1, 64) : 1; + this.dims = baseDims * randomMultiplier; } @Override @@ -88,7 +92,9 @@ private void indexMapping(XContentBuilder b, IndexVersion indexVersion) throws I protected Object getSampleValueForDocument() { int numVectors = randomIntBetween(1, 16); return Stream.generate( - () -> elementType == ElementType.FLOAT ? List.of(0.5, 0.5, 0.5, 0.5) : List.of((byte) 1, (byte) 1, (byte) 1, (byte) 1) + () -> elementType == ElementType.FLOAT + ? convertToList(randomNormalizedVector(this.dims)) + : convertToList(randomByteArrayOfLength(elementType == ElementType.BIT ? 
this.dims / Byte.SIZE : dims)) ).limit(numVectors).toList(); } diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/rank_vectors/rank_vectors_synthetic_vectors.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/rank_vectors/rank_vectors_synthetic_vectors.yml new file mode 100644 index 0000000000000..c0df9d6a79d38 --- /dev/null +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/rank_vectors/rank_vectors_synthetic_vectors.yml @@ -0,0 +1,357 @@ +setup: + - requires: + reason: 'synthetic vectors and rank_vectors are required' + cluster_features: [ "rank_vectors" ] + test_runner_features: [ capabilities ] + capabilities: + - method: GET + path: /_search + capabilities: [ synthetic_vectors_setting ] + - skip: + features: "headers" + + - do: + indices.create: + index: test + body: + settings: + index.mapping.synthetic_vectors: true + mappings: + properties: + name: + type: keyword + vector: + type: rank_vectors + dims: 3 + + nested: + type: nested + properties: + paragraph_id: + type: keyword + vector: + type: rank_vectors + dims: 3 + + - do: + index: + index: test + id: "1" + body: + name: cow.jpg + vector: [[1, 2, 3], [4, 5, 6]] + + - do: + index: + index: test + id: "2" + body: + name: moose.jpg + nested: + - paragraph_id: 0 + vector: [[1, 2, 3], [4, 5, 6]] + - paragraph_id: 2 + vector: [[4, 5, 6], [7, 8, 9]] + - paragraph_id: 3 + vector: [[7, 8, 9], [10, 11, 12], [13, 14, 15]] + + - do: + index: + index: test + id: "3" + body: + name: rabbit.jpg + vector: [[10, 11, 12]] + + - do: + index: + index: test + id: "4" + body: + name: zoolander.jpg + nested: + - paragraph_id: 0 + vector: [[13, 14, 15], [16, 17, 18]] + - paragraph_id: 1 + - paragraph_id: 2 + vector: [[16, 17, 18]] + + - do: + indices.refresh: {} + +--- +"exclude synthetic vectors": + - do: + search: + index: test + body: + sort: ["name"] + + - match: { hits.hits.0._id: "1"} + - match: { hits.hits.0._source.name: "cow.jpg"} + - not_exists: 
hits.hits.0._source.vector + + - match: { hits.hits.1._id: "2"} + - match: { hits.hits.1._source.name: "moose.jpg"} + - length: { hits.hits.1._source.nested: 3 } + - not_exists: hits.hits.1._source.nested.0.vector + - match: { hits.hits.1._source.nested.0.paragraph_id: 0 } + - not_exists: hits.hits.1._source.nested.1.vector + - match: { hits.hits.1._source.nested.1.paragraph_id: 2 } + - not_exists: hits.hits.1._source.nested.2.vector + - match: { hits.hits.1._source.nested.2.paragraph_id: 3 } + + - match: { hits.hits.2._id: "3" } + - match: { hits.hits.2._source.name: "rabbit.jpg" } + - not_exists: hits.hits.2._source.vector + + - match: { hits.hits.3._id: "4" } + - match: { hits.hits.3._source.name: "zoolander.jpg" } + - length: { hits.hits.3._source.nested: 3 } + - not_exists: hits.hits.3._source.nested.0.vector + - match: { hits.hits.3._source.nested.0.paragraph_id: 0 } + - match: { hits.hits.3._source.nested.1.paragraph_id: 1 } + - not_exists: hits.hits.3._source.nested.2.vector + - match: { hits.hits.3._source.nested.2.paragraph_id: 2 } + +--- +"include synthetic vectors": + - do: + search: + index: test + body: + _source: + exclude_vectors: false + sort: ["name"] + + - match: { hits.hits.0._id: "1"} + - match: { hits.hits.0._source.name: "cow.jpg"} + - exists: hits.hits.0._source.vector + + - match: { hits.hits.1._id: "2"} + - match: { hits.hits.1._source.name: "moose.jpg"} + - length: { hits.hits.1._source.nested: 3 } + - exists: hits.hits.1._source.nested.0.vector + - match: { hits.hits.1._source.nested.0.paragraph_id: 0 } + - exists: hits.hits.1._source.nested.1.vector + - match: { hits.hits.1._source.nested.1.paragraph_id: 2 } + - exists: hits.hits.1._source.nested.2.vector + - match: { hits.hits.1._source.nested.2.paragraph_id: 3 } + + - match: { hits.hits.2._id: "3" } + - match: { hits.hits.2._source.name: "rabbit.jpg" } + - exists: hits.hits.2._source.vector + + - match: { hits.hits.3._id: "4" } + - match: { hits.hits.3._source.name: "zoolander.jpg" } 
+ - length: { hits.hits.3._source.nested: 3 } + - exists: hits.hits.3._source.nested.0.vector + - length: { hits.hits.3._source.nested.0.vector: 2 } + - length: { hits.hits.3._source.nested.0.vector.0: 3 } + - length: { hits.hits.3._source.nested.0.vector.1: 3 } + - match: { hits.hits.3._source.nested.0.paragraph_id: 0 } + + - do: + search: + index: test + body: + _source: + exclude_vectors: false + includes: nested.vector + sort: ["name"] + + - match: { hits.hits.0._id: "1"} + - length: { hits.hits.0._source: 0} + + - match: { hits.hits.1._id: "2"} + - length: { hits.hits.1._source: 1 } + - length: { hits.hits.1._source.nested: 3 } + - exists: hits.hits.1._source.nested.0.vector + - not_exists: hits.hits.1._source.nested.0.paragraph_id + - exists: hits.hits.1._source.nested.1.vector + - not_exists: hits.hits.1._source.nested.1.paragraph_id + - exists: hits.hits.1._source.nested.2.vector + - not_exists: hits.hits.1._source.nested.2.paragraph_id + + - match: { hits.hits.2._id: "3" } + - length: { hits.hits.2._source: 0} + + - match: { hits.hits.3._id: "4" } + - length: { hits.hits.3._source: 1 } + - length: { hits.hits.3._source.nested: 2 } + - exists: hits.hits.3._source.nested.0.vector + - length: { hits.hits.3._source.nested.0.vector: 2 } + - length: { hits.hits.3._source.nested.0.vector.0: 3 } + - length: { hits.hits.3._source.nested.0.vector.1: 3 } + - not_exists: hits.hits.3._source.nested.0.paragraph_id + - exists: hits.hits.3._source.nested.1.vector + - length: { hits.hits.3._source.nested.1.vector: 1 } + - length: { hits.hits.3._source.nested.1.vector.0: 3 } + - not_exists: hits.hits.3._source.nested.1.paragraph_id + + - do: + headers: + # Force JSON content type so that we use a parser that interprets the embeddings as doubles + Content-Type: application/json + search: + index: test + body: + _source: + exclude_vectors: true + sort: ["name"] + fields: ["vector"] + + - match: { hits.hits.0._id: "1"} + - match: { hits.hits.0._source.name: "cow.jpg"} + -
not_exists: hits.hits.0._source.vector + - match: { hits.hits.0.fields.vector: [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]} + + - match: { hits.hits.1._id: "2"} + - match: { hits.hits.1._source.name: "moose.jpg"} + - length: { hits.hits.1._source.nested: 3 } + - not_exists: hits.hits.1._source.nested.0.vector + + - match: { hits.hits.2._id: "3" } + - match: { hits.hits.2._source.name: "rabbit.jpg" } + - match: { hits.hits.2.fields.vector: [[10.0, 11.0, 12.0]]} + + - match: { hits.hits.3._id: "4" } + - match: { hits.hits.3._source.name: "zoolander.jpg" } + - length: { hits.hits.3._source.nested: 3 } + - not_exists: hits.hits.3._source.nested.0.vector + + +--- +"Bulk partial update with synthetic vectors": + - do: + headers: + # Force JSON content type so that we use a parser that interprets the embeddings as doubles + Content-Type: application/json + bulk: + index: test + _source: true + body: + - '{"update": {"_id": "4"}}' + - > + { + "doc": { + "name": "zoolander2.jpg", + "vector": [[1, 2, 4], [3, 6, 9]] + } + } + + - match: { items.0.update.get._source.vector: [[1, 2, 4], [3, 6, 9]] } + - exists: items.0.update.get._source.nested + - length: { items.0.update.get._source.nested: 3} + - exists: items.0.update.get._source.nested.0.vector + - match: { items.0.update.get._source.nested.0.paragraph_id: 0 } + - length: { items.0.update.get._source.nested.0.vector: 2 } + - length: { items.0.update.get._source.nested.0.vector.0: 3 } + - length: { items.0.update.get._source.nested.0.vector.1: 3 } + - not_exists: items.0.update.get._source.nested.1.vector + - match: { items.0.update.get._source.nested.1.paragraph_id: 1 } + - exists: items.0.update.get._source.nested.2.vector + - length: { items.0.update.get._source.nested.2.vector: 1 } + - length: { items.0.update.get._source.nested.2.vector.0: 3 } + - match: { items.0.update.get._source.nested.2.paragraph_id: 2 } + - set: { items.0.update.get._source.nested: original_nested } + + - do: + headers: + # Force JSON content type so that 
we use a parser that interprets the embeddings as doubles + Content-Type: application/json + get: + _source_exclude_vectors: false + index: test + id: "4" + + - match: { _source.vector: [[1.0, 2.0, 4.0], [3.0, 6.0, 9.0]] } + - match: { _source.name: zoolander2.jpg } + - match: { _source.nested: $original_nested } + + - do: + indices.refresh: {} + + - do: + headers: + # Force JSON content type so that we use a parser that interprets the embeddings as doubles + Content-Type: application/json + search: + index: test + body: + _source: + "exclude_vectors": false + query: + term: + _id: 4 + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0._source.name: zoolander2.jpg } + - match: { hits.hits.0._source.nested: $original_nested } + +--- +"Partial update with synthetic vectors": + - do: + headers: + # Force JSON content type so that we use a parser that interprets the vectors as doubles + Content-Type: application/json + update: + index: test + id: "4" + body: + _source: true + doc: { + "name": "zoolander3.jpg", + "vector": [[6, 8, 9]] + } + + - match: { get._source.vector: [[6, 8, 9]] } + - exists: get._source.nested + - length: { get._source.nested: 3} + - exists: get._source.nested.0.vector + - match: { get._source.nested.0.paragraph_id: 0 } + - length: { get._source.nested.0.vector: 2 } + - length: { get._source.nested.0.vector.0: 3 } + - length: { get._source.nested.0.vector.1: 3 } + - not_exists: get._source.nested.1.vector + - match: { get._source.nested.1.paragraph_id: 1 } + - exists: get._source.nested.2.vector + - length: { get._source.nested.2.vector: 1 } + - length: { get._source.nested.2.vector.0: 3 } + - match: { get._source.nested.2.paragraph_id: 2 } + - set: { get._source.nested: original_nested } + + - do: + headers: + # Force JSON content type so that we use a parser that interprets the vectors as doubles + Content-Type: application/json + get: + _source_exclude_vectors: false + index: test + id: "4" + + - 
match: { _source.vector: [[6.0, 8.0, 9.0]] } + - match: { _source.name: zoolander3.jpg } + - match: { _source.nested: $original_nested } + + - do: + indices.refresh: {} + + - do: + headers: + # Force JSON content type so that we use a parser that interprets the vectors as doubles + Content-Type: application/json + search: + index: test + body: + _source: + "exclude_vectors": false + query: + term: + _id: 4 + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0._source.name: zoolander3.jpg } + - match: { hits.hits.0._source.nested: $original_nested }