late I rescore query with test and tidy

vigyasharma · vigyasharma · commit 9108ad1a1687 · 2025-07-03T13:08:28.000-07:00
diff --git a/lucene/core/src/java/org/apache/lucene/search/LateInteractionRescorer.java b/lucene/core/src/java/org/apache/lucene/search/LateInteractionRescorer.java
@@ -6,9 +6,9 @@
  * Rescores top N results from a first pass query using a {@link LateInteractionFloatValuesSource}
  *
  * <p>Typically, you run a low-cost first pass query to collect results from across the index, then
- * use this rescorer to rerank top N hits using multi-vectors, usually from a late interaction model.
- * Multi-vectors should be indexed in the {@link org.apache.lucene.document.LateInteractionField}
- * provided to rescorer.
+ * use this rescorer to rerank top N hits using multi-vectors, usually from a late interaction
+ * model. Multi-vectors should be indexed in the {@link
+ * org.apache.lucene.document.LateInteractionField} provided to rescorer.
  *
  * @lucene.experimental
  */
@@ -18,26 +18,27 @@ public LateInteractionRescorer(LateInteractionFloatValuesSource valuesSource) {
     super(valuesSource);
   }
 
-  /**
-   * Creates a LateInteractionRescorer for provided query vector.
-   */
+  /** Creates a LateInteractionRescorer for provided query vector. */
   public static LateInteractionRescorer create(String fieldName, float[][] queryVector) {
     return create(fieldName, queryVector, VectorSimilarityFunction.COSINE);
   }
 
   /**
    * Creates a LateInteractionRescorer for provided query vector.
    *
-   * <p>Top N results from a first pass query are rescored based on the similarity between {@code queryVector} and
-   * the multi-vector indexed in {@code fieldName}. If document does not have a value indexed in {@code fieldName},
-   * a 0f score is assigned.
+   * <p>Top N results from a first pass query are rescored based on the similarity between {@code
+   * queryVector} and the multi-vector indexed in {@code fieldName}. If document does not have a
+   * value indexed in {@code fieldName}, a 0f score is assigned.
    *
-   * @param fieldName the {@link org.apache.lucene.document.LateInteractionField} used for reranking.
+   * @param fieldName the {@link org.apache.lucene.document.LateInteractionField} used for
+   *     reranking.
    * @param queryVector query multi-vector to use for similarity comparison
    * @param vectorSimilarityFunction function used for vector similarity comparisons
    */
-  public static LateInteractionRescorer create(String fieldName, float[][] queryVector, VectorSimilarityFunction vectorSimilarityFunction) {
-    final LateInteractionFloatValuesSource valuesSource = new LateInteractionFloatValuesSource(fieldName, queryVector, vectorSimilarityFunction);
+  public static LateInteractionRescorer create(
+      String fieldName, float[][] queryVector, VectorSimilarityFunction vectorSimilarityFunction) {
+    final LateInteractionFloatValuesSource valuesSource =
+        new LateInteractionFloatValuesSource(fieldName, queryVector, vectorSimilarityFunction);
     return new LateInteractionRescorer(valuesSource);
   }
 
@@ -49,16 +50,19 @@ protected float combine(float firstPassScore, boolean valuePresent, double sourc
   /**
    * Creates a LateInteractionRescorer for provided query vector.
    *
-   * <p>Top N results from a first pass query are rescored based on the similarity between {@code queryVector} and
-   * the multi-vector indexed in {@code fieldName}. Falls back to score from the first pass query if a document
-   * does not have a value indexed in {@code fieldName}.
+   * <p>Top N results from a first pass query are rescored based on the similarity between {@code
+   * queryVector} and the multi-vector indexed in {@code fieldName}. Falls back to score from the
+   * first pass query if a document does not have a value indexed in {@code fieldName}.
    *
-   * @param fieldName the {@link org.apache.lucene.document.LateInteractionField} used for reranking.
+   * @param fieldName the {@link org.apache.lucene.document.LateInteractionField} used for
+   *     reranking.
    * @param queryVector query multi-vector to use for similarity comparison
    * @param vectorSimilarityFunction function used for vector similarity comparisons.
    */
-  public static LateInteractionRescorer withFallbackToFirstPassScore(String fieldName, float[][] queryVector, VectorSimilarityFunction vectorSimilarityFunction) {
-    final LateInteractionFloatValuesSource valuesSource = new LateInteractionFloatValuesSource(fieldName, queryVector, vectorSimilarityFunction);
+  public static LateInteractionRescorer withFallbackToFirstPassScore(
+      String fieldName, float[][] queryVector, VectorSimilarityFunction vectorSimilarityFunction) {
+    final LateInteractionFloatValuesSource valuesSource =
+        new LateInteractionFloatValuesSource(fieldName, queryVector, vectorSimilarityFunction);
     return new LateInteractionRescorer(valuesSource) {
       @Override
       protected float combine(float firstPassScore, boolean valuePresent, double sourceValue) {
diff --git a/lucene/core/src/java/org/apache/lucene/search/RescoreTopNQuery.java b/lucene/core/src/java/org/apache/lucene/search/RescoreTopNQuery.java
@@ -154,15 +154,26 @@ public static Query createFullPrecisionRescorerQuery(
    * Creates a {@code RescoreTopNQuery} that computes top N results using multi-vector similarity
    * comparisons against a late interaction field.
    *
+   * <p>Note: This query computes late interaction field similarity for the entire match-set of
+   * wrapped query, and returns a new query with only top-N hits in the match-set. This is typically
+   * useful in combining a query's results with other queries for hybrid search. To simply rerank
+   * the top N hits without scoring entire match-set, see {@link LateInteractionRescorer}.
+   *
    * @param in the inner Query to rescore
    * @param n number of results to keep
-   * @param fieldName the {@link org.apache.lucene.document.LateInteractionField} for recomputing top N hits
+   * @param fieldName the {@link org.apache.lucene.document.LateInteractionField} for recomputing
+   *     top N hits
    * @param queryVector query multi-vector to use for similarity comparisons
    * @param vectorSimilarityFunction function to use for vector similarity comparisons.
    */
   public static Query createLateInteractionQuery(
-      Query in, int n, String fieldName, float[][] queryVector, VectorSimilarityFunction vectorSimilarityFunction) {
-    final LateInteractionFloatValuesSource valuesSource = new LateInteractionFloatValuesSource(fieldName, queryVector, vectorSimilarityFunction);
+      Query in,
+      int n,
+      String fieldName,
+      float[][] queryVector,
+      VectorSimilarityFunction vectorSimilarityFunction) {
+    final LateInteractionFloatValuesSource valuesSource =
+        new LateInteractionFloatValuesSource(fieldName, queryVector, vectorSimilarityFunction);
     return new RescoreTopNQuery(in, valuesSource, n);
   }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRescoreTopNQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRescoreTopNQuery.java
@@ -17,18 +17,25 @@
 package org.apache.lucene.search;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Random;
+import java.util.Set;
+import java.util.stream.Collectors;
 import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.IntField;
 import org.apache.lucene.document.KnnFloatVectorField;
+import org.apache.lucene.document.LateInteractionField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.StoredFields;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.store.ByteBuffersDirectory;
@@ -156,6 +163,87 @@ public void testMissingDoubleValues() throws IOException {
     }
   }
 
+  public void testLateInteractionQuery() throws Exception {
+    final String LATE_I_FIELD = "li_vector";
+    final String KNN_FIELD = "knn_vector";
+    List<float[][]> corpus = new ArrayList<>();
+    final int numDocs = atLeast(1000);
+    final int numSegments = random().nextInt(2, 10);
+    final int dim = 128;
+    final VectorSimilarityFunction vectorSimilarityFunction =
+        VectorSimilarityFunction.values()[
+            random().nextInt(VectorSimilarityFunction.values().length)];
+    LateInteractionFloatValuesSource.ScoreFunction scoreFunction =
+        LateInteractionFloatValuesSource.ScoreFunction.values()[
+            random().nextInt(LateInteractionFloatValuesSource.ScoreFunction.values().length)];
+
+    try (Directory dir = newDirectory()) {
+      int id = 0;
+      try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
+        for (int j = 0; j < numSegments; j++) {
+          for (int i = 0; i < numDocs; i++) {
+            Document doc = new Document();
+            if (random().nextInt(100) < 30) {
+              // skip value for some docs to create sparse field
+              doc.add(new IntField("has_li_vector", 0, Field.Store.YES));
+            } else {
+              float[][] value = createMultiVector(dim);
+              corpus.add(value);
+              doc.add(new IntField("id", id++, Field.Store.YES));
+              doc.add(new LateInteractionField(LATE_I_FIELD, value));
+              doc.add(new KnnFloatVectorField(KNN_FIELD, randomFloatVector(dim, random())));
+              doc.add(new IntField("has_li_vector", 1, Field.Store.YES));
+            }
+            w.addDocument(doc);
+            w.flush();
+          }
+        }
+        // add a segment with no vectors
+        for (int i = 0; i < 100; i++) {
+          Document doc = new Document();
+          doc.add(new IntField("has_li_vector", 0, Field.Store.YES));
+          w.addDocument(doc);
+        }
+        w.flush();
+      }
+
+      float[][] lateIQueryVector = createMultiVector(dim);
+      float[] knnQueryVector = randomFloatVector(dim, random());
+      KnnFloatVectorQuery knnQuery = new KnnFloatVectorQuery(KNN_FIELD, knnQueryVector, 50);
+
+      try (IndexReader reader = DirectoryReader.open(dir)) {
+        final int topN = 10;
+        IndexSearcher s = new IndexSearcher(reader);
+        TopDocs knnHits = s.search(knnQuery, 5 * topN);
+        Set<Integer> knnHitDocs =
+            Arrays.stream(knnHits.scoreDocs).map(k -> k.doc).collect(Collectors.toSet());
+        Query lateIQuery =
+            RescoreTopNQuery.createLateInteractionQuery(
+                knnQuery, topN, LATE_I_FIELD, lateIQueryVector, vectorSimilarityFunction);
+        TopDocs lateIHits = s.search(lateIQuery, 3 * topN);
+        // total match-set for RescoreTopNQuery is topN
+        assertEquals(topN, lateIHits.scoreDocs.length);
+        StoredFields storedFields = reader.storedFields();
+        for (ScoreDoc hit : lateIHits.scoreDocs) {
+          assertTrue(knnHitDocs.contains(hit.doc));
+          int idValue = Integer.parseInt(storedFields.document(hit.doc).get("id"));
+          float[][] docVector = corpus.get(idValue);
+          float expected =
+              scoreFunction.compare(lateIQueryVector, docVector, vectorSimilarityFunction);
+          assertEquals(expected, hit.score, 1e-5);
+        }
+      }
+    }
+  }
+
+  private float[][] createMultiVector(int dimension) {
+    float[][] value = new float[random().nextInt(3, 12)][];
+    for (int i = 0; i < value.length; i++) {
+      value[i] = randomFloatVector(dimension, random());
+    }
+    return value;
+  }
+
   private float[] randomFloatVector(int dimension, Random random) {
     float[] vector = new float[dimension];
     for (int i = 0; i < dimension; i++) {
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/FunctionScoreQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/function/FunctionScoreQuery.java
@@ -20,14 +20,12 @@
 import java.io.IOException;
 import java.util.Objects;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.DoubleValues;
 import org.apache.lucene.search.DoubleValuesSource;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.FilterScorer;
 import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.LateInteractionFloatValuesSource;
 import org.apache.lucene.search.Matches;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryVisitor;
@@ -73,28 +71,6 @@ public DoubleValuesSource getSource() {
     return source;
   }
 
-  /**
-   * Returns a FunctionScoreQuery that re-scores hits from the wrapped query using late-interaction
-   * scores between provided query and indexed document multi-vectors.
-   *
-   * <p>Document multi-vectors are indexed using {@link
-   * org.apache.lucene.document.LateInteractionField}.
-   *
-   * @param in the query to re-score
-   * @param fieldName field containing document multi-vectors for re-scoring
-   * @param queryVector query multi-vector
-   * @param vectorSimilarityFunction vector similarity function used for computing scores
-   */
-  public static FunctionScoreQuery lateInteractionFloatRerankQuery(
-      Query in,
-      String fieldName,
-      float[][] queryVector,
-      VectorSimilarityFunction vectorSimilarityFunction) {
-    LateInteractionFloatValuesSource scoreSource =
-        new LateInteractionFloatValuesSource(fieldName, queryVector, vectorSimilarityFunction);
-    return new FunctionScoreQuery(in, scoreSource);
-  }
-
   /**
    * Returns a FunctionScoreQuery where the scores of a wrapped query are multiplied by the value of
    * a DoubleValuesSource.
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionScoreQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/function/TestFunctionScoreQuery.java
@@ -18,18 +18,9 @@
 package org.apache.lucene.queries.function;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Random;
-import java.util.Set;
 import java.util.concurrent.atomic.AtomicReference;
-import java.util.stream.Collectors;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.document.IntField;
-import org.apache.lucene.document.KnnFloatVectorField;
-import org.apache.lucene.document.LateInteractionField;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.expressions.Expression;
@@ -40,21 +31,16 @@
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.StoredFields;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.DoubleValuesSource;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.KnnFloatVectorQuery;
-import org.apache.lucene.search.LateInteractionFloatValuesSource;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.ScoreMode;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
@@ -391,91 +377,4 @@ public void testQueryMatchesCount() throws Exception {
     }
     assertEquals(searchCount, weightCount);
   }
-
-  public void testLateInteractionQuery() throws Exception {
-    final String LATE_I_FIELD = "li_vector";
-    final String KNN_FIELD = "knn_vector";
-    List<float[][]> corpus = new ArrayList<>();
-    final int numDocs = atLeast(1000);
-    final int numSegments = random().nextInt(2, 10);
-    final int dim = 128;
-    final VectorSimilarityFunction vectorSimilarityFunction =
-        VectorSimilarityFunction.values()[
-            random().nextInt(VectorSimilarityFunction.values().length)];
-    LateInteractionFloatValuesSource.ScoreFunction scoreFunction =
-        LateInteractionFloatValuesSource.ScoreFunction.values()[
-            random().nextInt(LateInteractionFloatValuesSource.ScoreFunction.values().length)];
-
-    try (Directory dir = newDirectory()) {
-      int id = 0;
-      try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
-        for (int j = 0; j < numSegments; j++) {
-          for (int i = 0; i < numDocs; i++) {
-            Document doc = new Document();
-            if (random().nextInt(100) < 30) {
-              // skip value for some docs to create sparse field
-              doc.add(new IntField("has_li_vector", 0, Field.Store.YES));
-            } else {
-              float[][] value = createMultiVector(dim);
-              corpus.add(value);
-              doc.add(new IntField("id", id++, Field.Store.YES));
-              doc.add(new LateInteractionField(LATE_I_FIELD, value));
-              doc.add(new KnnFloatVectorField(KNN_FIELD, randomVector(dim)));
-              doc.add(new IntField("has_li_vector", 1, Field.Store.YES));
-            }
-            w.addDocument(doc);
-            w.flush();
-          }
-        }
-        // add a segment with no vectors
-        for (int i = 0; i < 100; i++) {
-          Document doc = new Document();
-          doc.add(new IntField("has_li_vector", 0, Field.Store.YES));
-          w.addDocument(doc);
-        }
-        w.flush();
-      }
-
-      float[][] lateIQueryVector = createMultiVector(dim);
-      float[] knnQueryVector = randomVector(dim);
-      KnnFloatVectorQuery knnQuery = new KnnFloatVectorQuery(KNN_FIELD, knnQueryVector, 50);
-
-      try (IndexReader reader = DirectoryReader.open(dir)) {
-        IndexSearcher s = new IndexSearcher(reader);
-        TopDocs knnHits = s.search(knnQuery, 50);
-        Set<Integer> knnHitDocs =
-            Arrays.stream(knnHits.scoreDocs).map(k -> k.doc).collect(Collectors.toSet());
-        FunctionScoreQuery lateIQuery =
-            FunctionScoreQuery.lateInteractionFloatRerankQuery(
-                knnQuery, LATE_I_FIELD, lateIQueryVector, vectorSimilarityFunction);
-        TopDocs lateIHits = s.search(lateIQuery, 10);
-        StoredFields storedFields = reader.storedFields();
-        for (ScoreDoc hit : lateIHits.scoreDocs) {
-          assertTrue(knnHitDocs.contains(hit.doc));
-          int idValue = Integer.parseInt(storedFields.document(hit.doc).get("id"));
-          float[][] docVector = corpus.get(idValue);
-          float expected =
-              scoreFunction.compare(lateIQueryVector, docVector, vectorSimilarityFunction);
-          assertEquals(expected, hit.score, 1e-5);
-        }
-      }
-    }
-  }
-
-  private float[] randomVector(int dim) {
-    float[] v = new float[dim];
-    Random random = random();
-    for (int i = 0; i < dim; i++) {
-      v[i] = random.nextFloat();
-    }
-    return v;
-  }
-
-  private float[][] createMultiVector(int dimension) {
-    float[][] value = new float[random().nextInt(3, 12)][];
-    for (int i = 0; i < value.length; i++) {
-      value[i] = randomVector(dimension);
-    }
-    return value;
-  }
 }