use TopDocs and not ScoreDocs

markjhoy · markjhoy · commit d760242c82bb · 2025-10-06T20:36:31.000-04:00
diff --git a/server/src/main/java/org/elasticsearch/search/diversification/ResultDiversification.java b/server/src/main/java/org/elasticsearch/search/diversification/ResultDiversification.java
@@ -10,8 +10,8 @@
 package org.elasticsearch.search.diversification;
 
 import org.apache.lucene.index.VectorSimilarityFunction;
-import org.elasticsearch.search.SearchHit;
-import org.elasticsearch.search.SearchHits;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
 import org.elasticsearch.search.vectors.VectorData;
 
 import java.io.IOException;
@@ -23,24 +23,31 @@
  */
 public abstract class ResultDiversification {
 
-    public abstract SearchHits diversify(SearchHits hits, ResultDiversificationContext diversificationContext) throws IOException;
+    public abstract TopDocs diversify(TopDocs hits, ResultDiversificationContext diversificationContext) throws IOException;
 
+    /**
+     * Gathers the field vectors for the given hits and records each hit's position in the array.
+     * <p>
+     * A {@link ScoreDoc} carries no field values, so each vector is looked up in the context by doc id.
+     *
+     * @param docs the hits to look up
+     * @param context the diversification context that supplies the per-document vectors
+     * @param docIdIndexMapping output map; receives doc id to index-in-{@code docs} for every hit
+     * @return vectors keyed by doc id; hits for which the context has no vector are omitted
+     */
     protected Map<Integer, VectorData> getFieldVectorsForHits(
-        SearchHit[] searchHits,
+        ScoreDoc[] docs,
         ResultDiversificationContext context,
         Map<Integer, Integer> docIdIndexMapping
     ) {
         Map<Integer, VectorData> fieldVectors = new HashMap<>();
-        for (int i = 0; i < searchHits.length; i++) {
-            SearchHit hit = searchHits[i];
-            int docId = hit.docId();
+        for (int i = 0; i < docs.length; i++) {
+            int docId = docs[i].doc;
             docIdIndexMapping.put(docId, i);
-            Object collapseValue = hit.field(context.getField()).getValue();
-            if (collapseValue instanceof float[] vecData) {
-                fieldVectors.put(docId, new VectorData(vecData));
-            } else if (collapseValue instanceof byte[] byteVecData) {
-                fieldVectors.put(docId, new VectorData(byteVecData));
-            }
+            VectorData vector = context.getFieldVector(docId);
+            if (vector != null) {
+                fieldVectors.put(docId, vector);
+            }
         }
         return fieldVectors;
     }
diff --git a/server/src/main/java/org/elasticsearch/search/diversification/ResultDiversificationContext.java b/server/src/main/java/org/elasticsearch/search/diversification/ResultDiversificationContext.java
@@ -13,26 +13,34 @@
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
 import org.elasticsearch.search.vectors.VectorData;
 
+import java.util.Map;
+import java.util.Set;
+
 public abstract class ResultDiversificationContext {
     private final String field;
     private final int numCandidates;
     private final DenseVectorFieldMapper fieldMapper;
     private final IndexVersion indexVersion;
     private final VectorData queryVector;
+    // Field vectors keyed by document id; never null after construction.
+    private final Map<Integer, VectorData> fieldVectors;
 
     // Field _must_ be a dense_vector type
     protected ResultDiversificationContext(
         String field,
         int numCandidates,
         VectorData queryVector,
         DenseVectorFieldMapper fieldMapper,
-        IndexVersion indexVersion
+        IndexVersion indexVersion,
+        Map<Integer, VectorData> fieldVectors
     ) {
         this.field = field;
         this.numCandidates = numCandidates;
         this.fieldMapper = fieldMapper;
         this.indexVersion = indexVersion;
         this.queryVector = queryVector;
+        // A null map is treated as "no vectors" so the accessors below never throw on lookup.
+        this.fieldVectors = fieldVectors == null ? Map.of() : fieldVectors;
     }
 
     public String getField() {
@@ -58,4 +66,23 @@ public IndexVersion getIndexVersion() {
     public VectorData getQueryVector() {
         return queryVector;
     }
+
+    /**
+     * Looks up the field vector supplied for a document.
+     *
+     * @param docId the document id used as the key in the supplied vector map
+     * @return the vector for {@code docId}, or {@code null} when none was supplied
+     */
+    public VectorData getFieldVector(int docId) {
+        return fieldVectors.get(docId);
+    }
+
+    /**
+     * Exposes every (document id, vector) pair held by this context.
+     *
+     * @return the entry set of the backing map; this is a view of that map, not a copy
+     */
+    public Set<Map.Entry<Integer, VectorData>> getFieldVectorsEntrySet() {
+        return fieldVectors.entrySet();
+    }
 }
diff --git a/server/src/main/java/org/elasticsearch/search/diversification/mmr/MMRResultDiversification.java b/server/src/main/java/org/elasticsearch/search/diversification/mmr/MMRResultDiversification.java
@@ -10,9 +10,9 @@
 package org.elasticsearch.search.diversification.mmr;
 
 import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
-import org.elasticsearch.search.SearchHit;
-import org.elasticsearch.search.SearchHits;
 import org.elasticsearch.search.diversification.ResultDiversification;
 import org.elasticsearch.search.diversification.ResultDiversificationContext;
 import org.elasticsearch.search.vectors.VectorData;
@@ -26,20 +26,21 @@
 public class MMRResultDiversification extends ResultDiversification {
 
     @Override
-    public SearchHits diversify(SearchHits hits, ResultDiversificationContext diversificationContext) throws IOException {
-        if (hits == null || ((diversificationContext instanceof MMRResultDiversificationContext) == false)) {
-            return hits;
+    public TopDocs diversify(TopDocs topDocs, ResultDiversificationContext diversificationContext) throws IOException {
+        if (topDocs == null || ((diversificationContext instanceof MMRResultDiversificationContext) == false)) {
+            return topDocs;
         }
 
         MMRResultDiversificationContext context = (MMRResultDiversificationContext) diversificationContext;
-        SearchHit[] searchHits = hits.getHits();  // NOTE: by reference, not new array
 
-        if (searchHits.length == 0) {
-            return hits;
+        if (topDocs.scoreDocs == null || topDocs.scoreDocs.length == 0) {
+            return topDocs;
         }
 
         Map<Integer, Integer> docIdIndexMapping = new HashMap<>();
-        Map<Integer, VectorData> fieldVectors = getFieldVectorsForHits(searchHits, context, docIdIndexMapping);
+        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
+            docIdIndexMapping.put(topDocs.scoreDocs[i].doc, i);
+        }
 
         VectorSimilarityFunction similarityFunction = DenseVectorFieldMapper.VectorSimilarity.MAX_INNER_PRODUCT.vectorSimilarityFunction(
             context.getIndexVersion(),
@@ -52,48 +53,55 @@ public SearchHits diversify(SearchHits hits, ResultDiversificationContext divers
         // always add the highest scoring doc to the list
         int highestScoreDocId = -1;
         float highestScore = Float.MIN_VALUE;
-        for (SearchHit hit : searchHits) {
-            if (hit.getScore() > highestScore) {
-                highestScoreDocId = hit.docId();
-                highestScore = hit.getScore();
+        for (ScoreDoc doc : topDocs.scoreDocs) {
+            if (doc.score > highestScore) {
+                highestScoreDocId = doc.doc;
+                highestScore = doc.score;
             }
         }
         selectedDocIds.add(highestScoreDocId);
 
         // test the vector to see if we are using floats or bytes
-        VectorData firstVec = fieldVectors.get(highestScoreDocId);
+        VectorData firstVec = context.getFieldVector(highestScoreDocId);
         boolean useFloat = firstVec.isFloat();
 
         // cache the similarity scores for the query vector vs. searchHits
-        Map<Integer, Float> querySimilarity = getQuerySimilarityForDocs(searchHits, fieldVectors, similarityFunction, useFloat, context);
+        Map<Integer, Float> querySimilarity = getQuerySimilarityForDocs(topDocs.scoreDocs, similarityFunction, useFloat, context);
 
         Map<Integer, Map<Integer, Float>> cachedSimilarities = new HashMap<>();
         int numCandidates = context.getNumCandidates();
 
-        for (int x = 0; x < numCandidates && selectedDocIds.size() < numCandidates && selectedDocIds.size() < searchHits.length; x++) {
+        for (int x = 0; x < numCandidates
+            && selectedDocIds.size() < numCandidates
+            && selectedDocIds.size() < topDocs.scoreDocs.length; x++) {
             int thisMaxMMRDocId = -1;
             float thisMaxMMRScore = Float.NEGATIVE_INFINITY;
-            for (SearchHit thisHit : searchHits) {
-                int docId = thisHit.docId();
+            for (ScoreDoc doc : topDocs.scoreDocs) {
+                int docId = doc.doc;
 
                 if (selectedDocIds.contains(docId)) {
                     continue;
                 }
 
-                var thisDocVector = fieldVectors.get(docId);
+                var thisDocVector = context.getFieldVector(docId);
+                if (thisDocVector == null) {
+                    continue;
+                }
+
                 var cachedScoresForDoc = cachedSimilarities.getOrDefault(docId, new HashMap<>());
 
                 // compute MMR scores for remaining searchHits
                 float highestMMRScore = getHighestScoreForSelectedVectors(
-                    fieldVectors,
+                    docId,
+                    context,
                     similarityFunction,
                     useFloat,
                     thisDocVector,
                     cachedScoresForDoc
                 );
 
                 // compute MMR
-                float querySimilarityScore = querySimilarity.getOrDefault(thisHit.docId(), 0.0f);
+                float querySimilarityScore = querySimilarity.getOrDefault(doc.doc, 0.0f);
                 float mmr = (context.getLambda() * querySimilarityScore) - ((1 - context.getLambda()) * highestMMRScore);
                 if (mmr > thisMaxMMRScore) {
                     thisMaxMMRScore = mmr;
@@ -110,34 +118,29 @@ public SearchHits diversify(SearchHits hits, ResultDiversificationContext divers
         }
 
         // our return should be only those searchHits that are selected
-        SearchHit[] ret = new SearchHit[selectedDocIds.size()];
+        ScoreDoc[] ret = new ScoreDoc[selectedDocIds.size()];
         for (int i = 0; i < selectedDocIds.size(); i++) {
             int scoredDocIndex = docIdIndexMapping.get(selectedDocIds.get(i));
-            ret[i] = searchHits[scoredDocIndex];
+            ret[i] = topDocs.scoreDocs[scoredDocIndex];
         }
 
-        // cleanup for GC
-        searchHits = null;
-
-        return new SearchHits(
-            ret,
-            hits.getTotalHits(),
-            hits.getMaxScore(),
-            hits.getSortFields(),
-            hits.getCollapseField(),
-            hits.getCollapseValues()
-        );
+        return new TopDocs(topDocs.totalHits, ret);
     }
 
     private float getHighestScoreForSelectedVectors(
-        Map<Integer, VectorData> selectedVectors,
+        int docId,
+        MMRResultDiversificationContext context,
         VectorSimilarityFunction similarityFunction,
         boolean useFloat,
         VectorData thisDocVector,
         Map<Integer, Float> cachedScoresForDoc
     ) {
         float highestScore = Float.MIN_VALUE;
-        for (var vec : selectedVectors.entrySet()) {
+        for (var vec : context.getFieldVectorsEntrySet()) {
+            if (vec.getKey().equals(docId)) {
+                continue;
+            }
+
             if (cachedScoresForDoc.containsKey(vec.getKey())) {
                 float score = cachedScoresForDoc.get(vec.getKey());
                 if (score > highestScore) {
@@ -156,19 +159,17 @@ private float getHighestScoreForSelectedVectors(
     }
 
     protected Map<Integer, Float> getQuerySimilarityForDocs(
-        SearchHit[] searchHits,
-        Map<Integer, VectorData> fieldVectors,
+        ScoreDoc[] docs,
         VectorSimilarityFunction similarityFunction,
         boolean useFloat,
         ResultDiversificationContext context
     ) {
         Map<Integer, Float> querySimilarity = new HashMap<>();
-        for (SearchHit searchHit : searchHits) {
-            int docId = searchHit.docId();
-            VectorData vectorData = fieldVectors.get(docId);
+        for (ScoreDoc doc : docs) {
+            VectorData vectorData = context.getFieldVector(doc.doc);
             if (vectorData != null) {
                 float querySimilarityScore = getVectorComparisonScore(similarityFunction, useFloat, vectorData, context.getQueryVector());
-                querySimilarity.put(docId, querySimilarityScore);
+                querySimilarity.put(doc.doc, querySimilarityScore);
             }
         }
         return querySimilarity;
diff --git a/server/src/main/java/org/elasticsearch/search/diversification/mmr/MMRResultDiversificationContext.java b/server/src/main/java/org/elasticsearch/search/diversification/mmr/MMRResultDiversificationContext.java
@@ -14,6 +14,8 @@
 import org.elasticsearch.search.diversification.ResultDiversificationContext;
 import org.elasticsearch.search.vectors.VectorData;
 
+import java.util.Map;
+
 public class MMRResultDiversificationContext extends ResultDiversificationContext {
 
     private final float lambda;
@@ -24,9 +26,10 @@ public MMRResultDiversificationContext(
         int numCandidates,
         VectorData queryVector,
         DenseVectorFieldMapper fieldMapper,
-        IndexVersion indexVersion
+        IndexVersion indexVersion,
+        Map<Integer, VectorData> fieldVectors
     ) {
-        super(field, numCandidates, queryVector, fieldMapper, indexVersion);
+        super(field, numCandidates, queryVector, fieldMapper, indexVersion, fieldVectors);
         this.lambda = lambda;
     }
 
diff --git a/server/src/test/java/org/elasticsearch/search/diversification/mmr/MMRResultDiversificationTests.java b/server/src/test/java/org/elasticsearch/search/diversification/mmr/MMRResultDiversificationTests.java