From 668b95ca2c4a3a36ef0026dfc43f97b07045e354 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Wed, 12 Feb 2025 16:20:13 -0500 Subject: [PATCH] Fix synthetic source bug that would mishandle nested dense_vector fields (#122425) When utilizing synthetic source with nested fields, we attempt to rebuild the child values in addition to all the parent values. While this generally works well, it is possible that certain values are missing from some child docs. Consequently, we would iterate the vector values incorrectly, resulting in seemingly missing values or, potentially, exceptions indicating EOFs. closes: #122383 --- docs/changelog/122425.yaml | 5 + .../indices.create/20_synthetic_source.yml | 140 ++++++++++++++++++ .../vectors/DenseVectorFieldMapper.java | 18 +++ .../indices/CreateIndexCapabilities.java | 8 +- 4 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 docs/changelog/122425.yaml diff --git a/docs/changelog/122425.yaml b/docs/changelog/122425.yaml new file mode 100644 index 0000000000000..a0e590dcdc36c --- /dev/null +++ b/docs/changelog/122425.yaml @@ -0,0 +1,5 @@ +pr: 122425 +summary: Fix synthetic source bug that would mishandle nested `dense_vector` fields +area: Mapping +type: bug +issues: [122383] diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/20_synthetic_source.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/20_synthetic_source.yml index d1c492caf9b48..dc476147c9601 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/20_synthetic_source.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/20_synthetic_source.yml @@ -2008,3 +2008,143 @@ create index with use_synthetic_source: flush: false - gt: { test.store_size_in_bytes: 0 } - is_false: test.fields._recovery_source +--- +"Nested synthetic source with indexed dense vectors": + - requires: + test_runner_features: [ capabilities ] + 
capabilities: + - method: PUT + path: /{index} + capabilities: [ synthetic_nested_dense_vector_bug_fix ] + reason: "Requires synthetic source bugfix for dense vectors in nested objects" + - do: + indices.create: + index: nested_dense_vector_synthetic_test + body: + mappings: + properties: + parent: + type: nested + properties: + vector: + type: dense_vector + index: true + similarity: l2_norm + text: + type: text + settings: + index: + mapping: + source: + mode: synthetic + - do: + index: + index: nested_dense_vector_synthetic_test + id: 0 + refresh: true + body: { "parent": [ { "vector": [ 1, 2 ],"text": "foo" }, { "vector": [ 2, 2 ], "text": "bar" } ] } + + - do: + index: + index: nested_dense_vector_synthetic_test + id: 1 + refresh: true + body: { "parent": [ { "text": "foo" }, { "vector": [ 2, 2 ], "text": "bar" } ] } + + - do: + index: + index: nested_dense_vector_synthetic_test + id: 2 + refresh: true + body: { "parent": [ { "vector": [ 1, 2 ] }, { "vector": [ 2, 2 ], "text": "bar" } ] } + + + - do: + search: + index: nested_dense_vector_synthetic_test + body: + query: + match_all: {} + + - match: { hits.hits.0._source.parent.0.vector: [ 1.0, 2.0 ] } + - match: { hits.hits.0._source.parent.0.text: "foo" } + - match: { hits.hits.0._source.parent.1.vector: [ 2.0, 2.0 ] } + - match: { hits.hits.0._source.parent.1.text: "bar" } + - is_false: hits.hits.1._source.parent.0.vector + - match: { hits.hits.1._source.parent.0.text: "foo" } + - match: { hits.hits.1._source.parent.1.vector: [ 2.0, 2.0 ] } + - match: { hits.hits.1._source.parent.1.text: "bar" } + - match: {hits.hits.2._source.parent.0.vector: [ 1.0, 2.0 ] } + - is_false: hits.hits.2._source.parent.0.text + - match: { hits.hits.2._source.parent.1.vector: [ 2.0, 2.0 ] } + - match: { hits.hits.2._source.parent.1.text: "bar" } +--- +"Nested synthetic source with un-indexed dense vectors": + - requires: + test_runner_features: [ capabilities ] + capabilities: + - method: PUT + path: /{index} + capabilities: [ 
synthetic_nested_dense_vector_bug_fix ] + reason: "Requires synthetic source bugfix for dense vectors in nested objects" + - do: + indices.create: + index: nested_dense_vector_synthetic_test + body: + mappings: + properties: + parent: + type: nested + properties: + vector: + type: dense_vector + index: false + text: + type: text + settings: + index: + mapping: + source: + mode: synthetic + - do: + index: + index: nested_dense_vector_synthetic_test + id: 0 + refresh: true + body: { "parent": [ { "vector": [ 1, 2 ],"text": "foo" }, { "vector": [ 2, 2 ], "text": "bar" } ] } + + - do: + index: + index: nested_dense_vector_synthetic_test + id: 1 + refresh: true + body: { "parent": [ { "text": "foo" }, { "vector": [ 2, 2 ], "text": "bar" } ] } + + - do: + index: + index: nested_dense_vector_synthetic_test + id: 2 + refresh: true + body: { "parent": [ { "vector": [ 1, 2 ] }, { "vector": [ 2, 2 ], "text": "bar" } ] } + + + - do: + search: + index: nested_dense_vector_synthetic_test + body: + query: + match_all: {} + + - match: { hits.hits.0._source.parent.0.vector: [ 1.0, 2.0 ] } + - match: { hits.hits.0._source.parent.0.text: "foo" } + - match: { hits.hits.0._source.parent.1.vector: [ 2.0, 2.0 ] } + - match: { hits.hits.0._source.parent.1.text: "bar" } + - is_false: hits.hits.1._source.parent.0.vector + - match: { hits.hits.1._source.parent.0.text: "foo" } + - match: { hits.hits.1._source.parent.1.vector: [ 2.0, 2.0 ] } + - match: { hits.hits.1._source.parent.1.text: "bar" } + - match: {hits.hits.2._source.parent.0.vector: [ 1.0, 2.0 ] } + - is_false: hits.hits.2._source.parent.0.text + - match: { hits.hits.2._source.parent.1.vector: [ 2.0, 2.0 ] } + - match: { hits.hits.2._source.parent.1.text: "bar" } + diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java index 0d514408c912f..ce41c2164e205 100644 --- 
a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java @@ -2404,6 +2404,12 @@ public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf } KnnVectorValues.DocIndexIterator iterator = values.iterator(); return docId -> { + if (iterator.docID() > docId) { + return hasValue = false; + } + if (iterator.docID() == docId) { + return hasValue = true; + } hasValue = docId == iterator.advance(docId); hasMagnitude = hasValue && magnitudeReader != null && magnitudeReader.advanceExact(docId); ord = iterator.index(); @@ -2414,6 +2420,12 @@ public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf if (byteVectorValues != null) { KnnVectorValues.DocIndexIterator iterator = byteVectorValues.iterator(); return docId -> { + if (iterator.docID() > docId) { + return hasValue = false; + } + if (iterator.docID() == docId) { + return hasValue = true; + } hasValue = docId == iterator.advance(docId); ord = iterator.index(); return hasValue; @@ -2476,6 +2488,12 @@ public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf return null; } return docId -> { + if (values.docID() > docId) { + return hasValue = false; + } + if (values.docID() == docId) { + return hasValue = true; + } hasValue = docId == values.advance(docId); return hasValue; }; diff --git a/server/src/main/java/org/elasticsearch/rest/action/admin/indices/CreateIndexCapabilities.java b/server/src/main/java/org/elasticsearch/rest/action/admin/indices/CreateIndexCapabilities.java index 9083c781ae167..334e68648d853 100644 --- a/server/src/main/java/org/elasticsearch/rest/action/admin/indices/CreateIndexCapabilities.java +++ b/server/src/main/java/org/elasticsearch/rest/action/admin/indices/CreateIndexCapabilities.java @@ -26,5 +26,11 @@ public class CreateIndexCapabilities { */ private static final String LOOKUP_INDEX_MODE_CAPABILITY = 
"lookup_index_mode"; - public static final Set<String> CAPABILITIES = Set.of(LOGSDB_INDEX_MODE_CAPABILITY, LOOKUP_INDEX_MODE_CAPABILITY); + private static final String SYNTHETIC_NESTED_DENSE_VECTOR_BUG_FIX_CAPABILITY = "synthetic_nested_dense_vector_bug_fix"; + + public static final Set<String> CAPABILITIES = Set.of( + LOGSDB_INDEX_MODE_CAPABILITY, + LOOKUP_INDEX_MODE_CAPABILITY, + SYNTHETIC_NESTED_DENSE_VECTOR_BUG_FIX_CAPABILITY + ); }