Add full-precision byte vector similarity values source

vigyasharma · vigyasharma · commit 5edbd2bdb42d · 2025-05-24T00:32:43.000-07:00
diff --git a/lucene/core/src/java/org/apache/lucene/search/ByteVectorSimilarityValuesSource.java b/lucene/core/src/java/org/apache/lucene/search/ByteVectorSimilarityValuesSource.java
@@ -21,18 +21,51 @@
 import java.util.Arrays;
 import java.util.Objects;
 import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.VectorEncoding;
+import org.apache.lucene.index.VectorSimilarityFunction;
 
 /**
  * A {@link DoubleValuesSource} which computes the vector similarity scores between the query vector
  * and the {@link org.apache.lucene.document.KnnByteVectorField} for documents.
  */
 class ByteVectorSimilarityValuesSource extends VectorSimilarityValuesSource {
+
+  /** Returns similarity scores for {@code ctx}, computed on full precision vector values. */
+  public static DoubleValues fullPrecisionScores(
+      LeafReaderContext ctx, byte[] queryVector, String vectorField) throws IOException {
+    return new ByteVectorSimilarityValuesSource(queryVector, vectorField, true).getValues(ctx, null);
+  }
+
   private final byte[] queryVector;
+  private final boolean useFullPrecision;
 
+  /**
+   * Creates a {@link DoubleValuesSource} that returns the vector similarity score between the
+   * query vector and the field for documents, using the scorer of the configured vectors reader.
+   *
+   * @param vector the query vector
+   * @param fieldName the field name of the {@link org.apache.lucene.document.KnnByteVectorField}
+   */
   public ByteVectorSimilarityValuesSource(byte[] vector, String fieldName) {
+    this(vector, fieldName, false);
+  }
+
+  /**
+   * Creates a {@link DoubleValuesSource} that returns the vector similarity score between the
+   * provided query vector and the field for documents.
+   *
+   * @param vector the query vector; the array is stored by reference, not copied
+   * @param fieldName the field name of the {@link org.apache.lucene.document.KnnByteVectorField}
+   * @param useFullPrecision if true, similarity is computed on the full precision raw vectors,
+   *     otherwise the configured vectors reader is used, which may be quantized or full precision.
+   */
+  public ByteVectorSimilarityValuesSource(byte[] vector, String fieldName, boolean useFullPrecision) {
     super(fieldName);
     this.queryVector = vector;
+    this.useFullPrecision = useFullPrecision;
   }
 
   @Override
@@ -42,7 +75,43 @@ public VectorScorer getScorer(LeafReaderContext ctx) throws IOException {
       ByteVectorValues.checkField(ctx.reader(), fieldName);
       return null;
     }
-    return vectorValues.scorer(queryVector);
+
+    final FieldInfo fi = ctx.reader().getFieldInfos().fieldInfo(fieldName);
+    if (fi.getVectorEncoding() != VectorEncoding.BYTE) {
+      throw new IllegalArgumentException(
+          "Field "
+              + fieldName
+              + " does not have the expected vector encoding: "
+              + VectorEncoding.BYTE);
+    }
+    if (fi.getVectorDimension() != queryVector.length) {
+      throw new IllegalArgumentException(
+          "Query vector dimension does not match field dimension: "
+              + queryVector.length
+              + " != "
+              + fi.getVectorDimension());
+    }
+
+    // default vector scorer
+    if (useFullPrecision == false) {
+      return vectorValues.scorer(queryVector);
+    }
+
+    final VectorSimilarityFunction vectorSimilarityFunction = fi.getVectorSimilarityFunction();
+    return new VectorScorer() {
+      final KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
+
+      @Override
+      public float score() throws IOException {
+        return vectorSimilarityFunction.compare(
+            queryVector, vectorValues.vectorValue(iterator.index()));
+      }
+
+      @Override
+      public DocIdSetIterator iterator() {
+        return iterator;
+      }
+    };
   }
 
   @Override
diff --git a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java
@@ -250,14 +250,6 @@ public LongValuesSource rewrite(IndexSearcher searcher) throws IOException {
    */
   public static DoubleValues similarityToQueryVector(
       LeafReaderContext ctx, byte[] queryVector, String vectorField) throws IOException {
-    if (ctx.reader().getFieldInfos().fieldInfo(vectorField).getVectorEncoding()
-        != VectorEncoding.BYTE) {
-      throw new IllegalArgumentException(
-          "Field "
-              + vectorField
-              + " does not have the expected vector encoding: "
-              + VectorEncoding.BYTE);
-    }
     return new ByteVectorSimilarityValuesSource(queryVector, vectorField).getValues(ctx, null);
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/search/FloatVectorSimilarityValuesSource.java b/lucene/core/src/java/org/apache/lucene/search/FloatVectorSimilarityValuesSource.java
@@ -33,6 +33,7 @@
  */
 class FloatVectorSimilarityValuesSource extends VectorSimilarityValuesSource {
 
+  /** Returns similarity scores for {@code ctx}, computed on full precision vector values. */
   public static DoubleValues fullPrecisionScores(
       LeafReaderContext ctx, float[] queryVector, String vectorField) throws IOException {
     return new FloatVectorSimilarityValuesSource(queryVector, vectorField, true).getValues(ctx, null);
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestQuantizedVectorSimilarityValueSource.java b/lucene/core/src/test/org/apache/lucene/search/TestQuantizedVectorSimilarityValueSource.java
@@ -9,6 +9,7 @@
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.IntField;
+import org.apache.lucene.document.KnnByteVectorField;
 import org.apache.lucene.document.KnnFloatVectorField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
@@ -146,7 +147,6 @@ public void testFullPrecisionVectorSimilarityDVS() throws Exception {
 
       float[] queryVector = TestVectorUtil.randomVector(VECTOR_DIMENSION);
       try (IndexReader reader = DirectoryReader.open(dir)) {
-        FieldExistsQuery query = new FieldExistsQuery(KNN_FIELD);
         for (LeafReaderContext ctx : reader.leaves()) {
           DoubleValues fpSimValues = FloatVectorSimilarityValuesSource.fullPrecisionScores(ctx, queryVector, KNN_FIELD);
           DoubleValues quantizedSimValues = DoubleValuesSource.similarityToQueryVector(ctx, queryVector, KNN_FIELD);
@@ -180,4 +180,114 @@ public void testFullPrecisionVectorSimilarityDVS() throws Exception {
       }
     }
   }
+
+  @Test
+  public void testFullPrecisionByteVectorSimilarityDVS() throws Exception {
+    List<byte[]> vectors = new ArrayList<>();
+    int numVectors = atLeast(NUM_VECTORS);
+    int numSegments = random().nextInt(2, 10);
+    final VectorSimilarityFunction vectorSimilarityFunction =
+        VectorSimilarityFunction.values()[
+            random().nextInt(VectorSimilarityFunction.values().length)];
+
+    try (Directory dir = newDirectory()) {
+      int id = 0;
+
+      // index byte vectors through a writer configured with the 4 bit quantized knn format
+      try (IndexWriter w =
+               new IndexWriter(
+                   dir,
+                   newIndexWriterConfig().setCodec(TestUtil.alwaysKnnVectorsFormat(getKnnFormat(4))))) {
+        for (int j = 0; j < numSegments; j++) {
+          for (int i = 0; i < numVectors; i++) {
+            Document doc = new Document();
+            if (random().nextInt(100) < 30) {
+              // roughly 30% of docs get no vector, so the vector field is sparse
+              doc.add(new IntField("has_vector", 0, Field.Store.YES));
+            } else {
+              byte[] vector = TestVectorUtil.randomVectorBytes(VECTOR_DIMENSION);
+              vectors.add(vector);
+              doc.add(new IntField("id", id++, Field.Store.YES));
+              doc.add(new KnnByteVectorField(KNN_FIELD, vector, vectorSimilarityFunction));
+              doc.add(new IntField("has_vector", 1, Field.Store.YES));
+            }
+            w.addDocument(doc);
+            w.flush(); // NOTE(review): flushed per document rather than per segment; confirm
+          }
+        }
+        // add 100 docs without vectors, flushed together as a segment with no vectors
+        for (int i = 0; i < 100; i++) {
+          Document doc = new Document();
+          doc.add(new IntField("has_vector", 0, Field.Store.YES));
+          w.addDocument(doc);
+        }
+        w.flush();
+      }
+
+      // index byte vectors through a writer configured with the 7 bit quantized knn format
+      try (IndexWriter w =
+               new IndexWriter(
+                   dir,
+                   newIndexWriterConfig().setCodec(TestUtil.alwaysKnnVectorsFormat(getKnnFormat(7))))) {
+        for (int j = 0; j < numSegments; j++) {
+          for (int i = 0; i < numVectors; i++) {
+            Document doc = new Document();
+            if (random().nextInt(100) < 30) {
+              // roughly 30% of docs get no vector, so the vector field is sparse
+              doc.add(new IntField("has_vector", 0, Field.Store.YES));
+            } else {
+              byte[] vector = TestVectorUtil.randomVectorBytes(VECTOR_DIMENSION);
+              vectors.add(vector);
+              doc.add(new IntField("id", id++, Field.Store.YES));
+              doc.add(new KnnByteVectorField(KNN_FIELD, vector, vectorSimilarityFunction));
+              doc.add(new IntField("has_vector", 1, Field.Store.YES));
+            }
+            w.addDocument(doc);
+            w.flush(); // NOTE(review): flushed per document rather than per segment; confirm
+          }
+        }
+        // add 100 docs without vectors, flushed together as a segment with no vectors
+        for (int i = 0; i < 100; i++) {
+          Document doc = new Document();
+          doc.add(new IntField("has_vector", 0, Field.Store.YES));
+          w.addDocument(doc);
+        }
+        w.flush();
+      }
+
+      byte[] queryVector = TestVectorUtil.randomVectorBytes(VECTOR_DIMENSION);
+      try (IndexReader reader = DirectoryReader.open(dir)) {
+        for (LeafReaderContext ctx : reader.leaves()) {
+          DoubleValues fpSimValues = ByteVectorSimilarityValuesSource.fullPrecisionScores(ctx, queryVector, KNN_FIELD);
+          DoubleValues quantizedSimValues = DoubleValuesSource.similarityToQueryVector(ctx, queryVector, KNN_FIELD);
+          // a segment without vectors must yield EMPTY values from both sources
+          if (fpSimValues == DoubleValues.EMPTY || quantizedSimValues == DoubleValues.EMPTY) {
+            assertEquals(fpSimValues, quantizedSimValues);
+            assertNull(ctx.reader().getByteVectorValues(KNN_FIELD));
+            continue;
+          }
+          StoredFields storedFields = ctx.reader().storedFields();
+          VectorScorer quantizedScorer =
+              ctx.reader().getByteVectorValues(KNN_FIELD).scorer(queryVector);
+          DocIdSetIterator disi = quantizedScorer.iterator();
+          while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+            int doc = disi.docID();
+            fpSimValues.advanceExact(doc);
+            quantizedSimValues.advanceExact(doc);
+            int idValue = Integer.parseInt(storedFields.document(doc).get("id"));
+            byte[] docVector = vectors.get(idValue);
+            assert docVector != null : "Vector for id " + idValue + " not found";
+            // full precision scores must match a direct comparison against the indexed vector
+            double expectedFpScore = vectorSimilarityFunction.compare(queryVector, docVector);
+            double actualFpScore = fpSimValues.doubleValue();
+            assertEquals(expectedFpScore, actualFpScore, 1e-5);
+            // scores from the default source must match the scorer of the vectors reader
+            double expectedQScore = quantizedScorer.score();
+            double actualQScore = quantizedSimValues.doubleValue();
+            assertEquals(expectedQScore, actualQScore, 1e-5);
+          }
+        }
+      }
+    }
+  }
 }