Skip to content

Commit 854c520

Browse files
iverase and weizijun authored
[9.1][Vector Search] BugFix wrong vector docvalue_fields (elastic#138342)
* [9.1][Vector Search] BugFix wrong vector docvalue_fields
* fix backport
---------
Co-authored-by: weizijun <[email protected]>
1 parent 6c1a0d3 commit 854c520

File tree

5 files changed

+282
-1
lines changed

5 files changed

+282
-1
lines changed

docs/changelog/137862.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 137862
2+
summary: "[Vector Search] Fix wrong vector docvalue_fields"
3+
area: Vector Search
4+
type: bug
5+
issues: []

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,3 +161,77 @@ setup:
161161
- close_to: { hits.hits.2.fields.vector2.0.4: { value: -100.0, error: 0.001 } }
162162

163163
- match: {hits.hits.2.fields.vector3.0: [-1, 100, -13, 15, -128]}
164+
165+
---
166+
"Dense vector cosine docvalue_fields with sparse documents after force merge":
167+
- requires:
168+
cluster_features: [ "mapper.fix_dense_vector_wrong_fields" ]
169+
reason: Support for dense vector doc value fields capability required
170+
- do:
171+
indices.create:
172+
index: test_cosine
173+
body:
174+
mappings:
175+
properties:
176+
foo:
177+
type: keyword
178+
vec:
179+
type: dense_vector
180+
similarity: cosine
181+
182+
# Create sparse vector scenario - some docs have vectors, some don't
183+
- do:
184+
index:
185+
index: test_cosine
186+
id: "1"
187+
body:
188+
foo: "bar"
189+
190+
- do:
191+
index:
192+
index: test_cosine
193+
id: "2"
194+
body:
195+
foo: "bar"
196+
197+
- do:
198+
index:
199+
index: test_cosine
200+
id: "3"
201+
body:
202+
vec: [1, 2]
203+
204+
- do:
205+
indices.refresh:
206+
index: test_cosine
207+
208+
- do:
209+
index:
210+
index: test_cosine
211+
id: "4"
212+
body:
213+
foo: "bar"
214+
215+
# Force merge to create the ord/docId mapping issue scenario
216+
- do:
217+
indices.forcemerge:
218+
index: test_cosine
219+
max_num_segments: 1
220+
221+
# Search with docvalue_fields to get the vector values
222+
- do:
223+
search:
224+
index: test_cosine
225+
body:
226+
docvalue_fields: ["vec"]
227+
query:
228+
term:
229+
_id: "3"
230+
231+
# Verify that _id=3's vector value is correctly returned as [1, 2]
232+
# This test verifies that the ordToDoc fix works correctly
233+
- match: { hits.total.value: 1 }
234+
- match: { hits.hits.0._id: "3" }
235+
- length: { hits.hits.0.fields.vec.0: 2 }
236+
- close_to: { hits.hits.0.fields.vec.0.0: { value: 1.0, error: 0.001 } }
237+
- close_to: { hits.hits.0.fields.vec.0.1: { value: 2.0, error: 0.001 } }

server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ public class MapperFeatures implements FeatureSpecification {
4949
static final NodeFeature SEARCH_LOAD_PER_SHARD = new NodeFeature("mapper.search_load_per_shard");
5050
static final NodeFeature PATTERNED_TEXT = new NodeFeature("mapper.patterned_text");
5151
public static final NodeFeature MULTI_FIELD_UNICODE_OPTIMISATION_FIX = new NodeFeature("mapper.multi_field.unicode_optimisation_fix");
52+
public static final NodeFeature FIX_DENSE_VECTOR_WRONG_FIELDS = new NodeFeature("mapper.fix_dense_vector_wrong_fields");
5253

5354
@Override
5455
public Set<NodeFeature> getTestFeatures() {
@@ -83,7 +84,8 @@ public Set<NodeFeature> getTestFeatures() {
8384
SPARSE_VECTOR_INDEX_OPTIONS_FEATURE,
8485
PATTERNED_TEXT,
8586
MULTI_FIELD_UNICODE_OPTIMISATION_FIX,
86-
MATCH_ONLY_TEXT_BLOCK_LOADER_FIX
87+
MATCH_ONLY_TEXT_BLOCK_LOADER_FIX,
88+
FIX_DENSE_VECTOR_WRONG_FIELDS
8789
);
8890
}
8991
}

server/src/main/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValues.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ public VectorScorer scorer(float[] floats) throws IOException {
5959
return in.scorer(floats);
6060
}
6161

62+
@Override
63+
public int ordToDoc(int ord) {
64+
return in.ordToDoc(ord);
65+
}
66+
6267
public float magnitude() {
6368
return magnitude;
6469
}

server/src/test/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValuesTests.java

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@
99

1010
package org.elasticsearch.index.mapper.vectors;
1111

12+
import org.apache.lucene.index.FloatVectorValues;
1213
import org.apache.lucene.index.KnnVectorValues;
1314
import org.apache.lucene.index.NumericDocValues;
15+
import org.apache.lucene.search.VectorScorer;
1416
import org.elasticsearch.test.ESTestCase;
1517

1618
import java.io.IOException;
@@ -114,4 +116,197 @@ public long cost() {
114116
};
115117
}
116118

119+
public void testOrdToDocWithSparseVectors() throws IOException {
120+
// This test simulates a real-world scenario where some documents don't have vector fields
121+
// After force merge, the ord (ordinal) and docId mapping becomes crucial
122+
123+
// Simulate a scenario where we have 6 documents, but only documents 1, 3, 4 have vectors
124+
// Doc 0: no vector
125+
// Doc 1: vector [0.6, 0.8, 0.0, 0.0], magnitude 5.0 -> original [3.0, 4.0, 0.0, 0.0]
126+
// Doc 2: no vector
127+
// Doc 3: vector [1.0, 0.0, 0.0, 0.0], magnitude 2.0 -> original [2.0, 0.0, 0.0, 0.0]
128+
// Doc 4: vector [0.0, 0.0, 0.6, 0.8], magnitude 10.0 -> original [0.0, 0.0, 6.0, 8.0]
129+
// Doc 5: no vector
130+
131+
// After merge, the vector ordinals will be 0, 1, 2 but they correspond to docIds 1, 3, 4
132+
int totalDocs = 6;
133+
int[] docIdsWithVectors = { 1, 3, 4 }; // Document IDs that have vectors
134+
int numVectors = docIdsWithVectors.length;
135+
136+
float[][] normalizedVectors = new float[numVectors][];
137+
float[] magnitudes = new float[numVectors];
138+
139+
normalizedVectors[0] = new float[] { 0.6f, 0.8f, 0.0f, 0.0f }; // Doc 1
140+
magnitudes[0] = 5.0f;
141+
142+
normalizedVectors[1] = new float[] { 1.0f, 0.0f, 0.0f, 0.0f }; // Doc 3
143+
magnitudes[1] = 2.0f;
144+
145+
normalizedVectors[2] = new float[] { 0.0f, 0.0f, 0.6f, 0.8f }; // Doc 4
146+
magnitudes[2] = 10.0f;
147+
148+
// Expected original vectors after denormalization
149+
float[][] expectedVectors = new float[numVectors][];
150+
expectedVectors[0] = new float[] { 3.0f, 4.0f, 0.0f, 0.0f }; // Doc 1
151+
expectedVectors[1] = new float[] { 2.0f, 0.0f, 0.0f, 0.0f }; // Doc 3
152+
expectedVectors[2] = new float[] { 0.0f, 0.0f, 6.0f, 8.0f }; // Doc 4
153+
154+
// Create a custom FloatVectorValues that simulates post-merge sparse vector scenario
155+
FloatVectorValues sparseVectorValues = new FloatVectorValues() {
156+
@Override
157+
public int dimension() {
158+
return 4;
159+
}
160+
161+
@Override
162+
public int size() {
163+
return numVectors;
164+
}
165+
166+
@Override
167+
public DocIndexIterator iterator() {
168+
return new DocIndexIterator() {
169+
private int index = -1;
170+
171+
@Override
172+
public int docID() {
173+
return index;
174+
}
175+
176+
@Override
177+
public int index() {
178+
return index;
179+
}
180+
181+
@Override
182+
public int nextDoc() {
183+
return advance(index + 1);
184+
}
185+
186+
@Override
187+
public int advance(int target) {
188+
if (target >= numVectors) return NO_MORE_DOCS;
189+
return index = target;
190+
}
191+
192+
@Override
193+
public long cost() {
194+
return numVectors;
195+
}
196+
};
197+
}
198+
199+
@Override
200+
public FloatVectorValues copy() {
201+
throw new UnsupportedOperationException();
202+
}
203+
204+
@Override
205+
public VectorScorer scorer(float[] floats) {
206+
throw new UnsupportedOperationException();
207+
}
208+
209+
// This is the key method - it maps ordinals to actual document IDs
210+
@Override
211+
public int ordToDoc(int ord) {
212+
// ord 0 -> docId 1, ord 1 -> docId 3, ord 2 -> docId 4
213+
return docIdsWithVectors[ord];
214+
}
215+
216+
@Override
217+
public float[] vectorValue(int ord) {
218+
return normalizedVectors[ord];
219+
}
220+
};
221+
222+
// Create magnitudes that correspond to the actual document IDs
223+
NumericDocValues sparseMagnitudes = new NumericDocValues() {
224+
private int docId = -1;
225+
226+
@Override
227+
public long longValue() {
228+
// Find which vector index corresponds to this docId
229+
for (int i = 0; i < docIdsWithVectors.length; i++) {
230+
if (docIdsWithVectors[i] == docId) {
231+
return Float.floatToRawIntBits(magnitudes[i]);
232+
}
233+
}
234+
return Float.floatToRawIntBits(1.0f); // Default magnitude
235+
}
236+
237+
@Override
238+
public boolean advanceExact(int target) {
239+
docId = target;
240+
// Check if this docId has a vector
241+
for (int vectorDocId : docIdsWithVectors) {
242+
if (vectorDocId == target) {
243+
return true;
244+
}
245+
}
246+
return false;
247+
}
248+
249+
@Override
250+
public int docID() {
251+
return docId;
252+
}
253+
254+
@Override
255+
public int nextDoc() {
256+
return advance(docId + 1);
257+
}
258+
259+
@Override
260+
public int advance(int target) {
261+
for (int vectorDocId : docIdsWithVectors) {
262+
if (vectorDocId >= target) {
263+
docId = vectorDocId;
264+
return docId;
265+
}
266+
}
267+
return NO_MORE_DOCS;
268+
}
269+
270+
@Override
271+
public long cost() {
272+
return totalDocs;
273+
}
274+
};
275+
276+
// Test the fixed version (with ordToDoc)
277+
DenormalizedCosineFloatVectorValues vectorValues = new DenormalizedCosineFloatVectorValues(sparseVectorValues, sparseMagnitudes);
278+
279+
// Test that ordToDoc method properly maps ordinals to document IDs
280+
KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
281+
282+
for (int ord = 0; ord < numVectors; ord++) {
283+
iterator.advance(ord);
284+
285+
// Verify that ordToDoc works correctly
286+
int expectedDocId = docIdsWithVectors[ord];
287+
int actualDocId = vectorValues.ordToDoc(ord);
288+
assertEquals("ordToDoc should correctly map ord " + ord + " to docId " + expectedDocId, expectedDocId, actualDocId);
289+
290+
// Get the denormalized vector - this relies on ordToDoc working correctly
291+
float[] actualVector = vectorValues.vectorValue(iterator.index());
292+
float actualMagnitude = vectorValues.magnitude();
293+
294+
// Verify the denormalized vector is correct
295+
assertArrayEquals(
296+
"Vector at ord " + ord + " (docId " + expectedDocId + ") should be correctly denormalized",
297+
expectedVectors[ord],
298+
actualVector,
299+
1e-6f
300+
);
301+
302+
// Verify the magnitude is correct
303+
assertEquals(
304+
"Magnitude at ord " + ord + " (docId " + expectedDocId + ") should be correct",
305+
magnitudes[ord],
306+
actualMagnitude,
307+
1e-6f
308+
);
309+
}
310+
}
311+
117312
}

0 commit comments

Comments
 (0)