Clean up and complete the MMR result diversification implementation

markjhoy · markjhoy · commit 23494e446eb8 · 2025-10-03T18:04:02.000-04:00
diff --git a/server/src/main/java/org/elasticsearch/search/diversification/ResultDiversification.java b/server/src/main/java/org/elasticsearch/search/diversification/ResultDiversification.java
@@ -10,7 +10,6 @@
 package org.elasticsearch.search.diversification;
 
 import org.apache.lucene.index.VectorSimilarityFunction;
-import org.apache.lucene.search.Explanation;
 import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.search.SearchHits;
 import org.elasticsearch.search.vectors.VectorData;
@@ -26,20 +25,14 @@ public abstract class ResultDiversification {
 
     public abstract SearchHits diversify(SearchHits hits, ResultDiversificationContext diversificationContext) throws IOException;
 
-    public abstract Explanation explain(
-        int topLevelDocId,
-        ResultDiversificationContext diversificationContext,
-        Explanation sourceExplanation
-    ) throws IOException;
-
     protected Map<Integer, VectorData> getFieldVectorsForHits(
-        SearchHit[] hits,
+        SearchHit[] searchHits,
         ResultDiversificationContext context,
         Map<Integer, Integer> docIdIndexMapping
     ) {
         Map<Integer, VectorData> fieldVectors = new HashMap<>();
-        for (int i = 0; i < hits.length; i++) {
-            SearchHit hit = hits[i];
+        for (int i = 0; i < searchHits.length; i++) {
+            SearchHit hit = searchHits[i];
             int docId = hit.docId();
             docIdIndexMapping.put(docId, i);
             Object collapseValue = hit.field(context.getField()).getValue();
diff --git a/server/src/main/java/org/elasticsearch/search/diversification/mmr/MMRResultDiversification.java b/server/src/main/java/org/elasticsearch/search/diversification/mmr/MMRResultDiversification.java
@@ -10,7 +10,6 @@
 package org.elasticsearch.search.diversification.mmr;
 
 import org.apache.lucene.index.VectorSimilarityFunction;
-import org.apache.lucene.search.Explanation;
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
 import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.search.SearchHits;
@@ -33,63 +32,60 @@ public SearchHits diversify(SearchHits hits, ResultDiversificationContext divers
         }
 
         MMRResultDiversificationContext context = (MMRResultDiversificationContext) diversificationContext;
-        SearchHit[] docs = hits.getHits();  // NOTE: by reference, not new array
+        SearchHit[] searchHits = hits.getHits();  // NOTE: by reference, not new array
 
-        if (docs.length == 0) {
+        if (searchHits.length == 0) {
             return hits;
         }
 
         Map<Integer, Integer> docIdIndexMapping = new HashMap<>();
-        Map<Integer, VectorData> fieldVectors = getFieldVectorsForHits(docs, context, docIdIndexMapping);
+        Map<Integer, VectorData> fieldVectors = getFieldVectorsForHits(searchHits, context, docIdIndexMapping);
 
         VectorSimilarityFunction similarityFunction = DenseVectorFieldMapper.VectorSimilarity.MAX_INNER_PRODUCT.vectorSimilarityFunction(
             context.getIndexVersion(),
             diversificationContext.getElementType()
         );
 
-        List<Integer> rerankedDocIds = new ArrayList<>();
-        Map<Integer, VectorData> selectedVectors = new HashMap<>();
+        // doc IDs of the hits selected so far, in selection order
+        List<Integer> selectedDocIds = new ArrayList<>();
 
         // always add the highest scoring doc to the list
-        int highestDocIdIndex = -1;
+        int highestScoreDocId = -1;
         float highestScore = Float.MIN_VALUE;
-        for (int i = 0; i < docs.length; i++) {
-            if (docs[i].getScore() > highestScore) {
-                highestDocIdIndex = i;
-                highestScore = docs[i].getScore();
+        for (SearchHit hit : searchHits) {
+            if (hit.getScore() > highestScore) {
+                highestScoreDocId = hit.docId();
+                highestScore = hit.getScore();
             }
         }
-        int firstDocId = docs[highestDocIdIndex].docId();
-        rerankedDocIds.add(firstDocId);
+        selectedDocIds.add(highestScoreDocId);
 
-        // and add the vector for the first items
-        VectorData firstVec = fieldVectors.get(firstDocId);
-        selectedVectors.put(firstDocId, firstVec);
+        // test the vector to see if we are using floats or bytes
+        VectorData firstVec = fieldVectors.get(highestScoreDocId);
         boolean useFloat = firstVec.isFloat();
 
-        // cache the similarity scores for the query vector vs. docs
-        Map<Integer, Float> querySimilarity = getQuerySimilarityForDocs(docs, fieldVectors, similarityFunction, useFloat, context);
+        // cache the similarity score between the query vector and each search hit's vector
+        Map<Integer, Float> querySimilarity = getQuerySimilarityForDocs(searchHits, fieldVectors, similarityFunction, useFloat, context);
 
         Map<Integer, Map<Integer, Float>> cachedSimilarities = new HashMap<>();
         int numCandidates = context.getNumCandidates();
 
-        for (int x = 0; x < numCandidates && rerankedDocIds.size() < numCandidates && rerankedDocIds.size() < docs.length; x++) {
+        for (int x = 0; x < numCandidates && selectedDocIds.size() < numCandidates && selectedDocIds.size() < searchHits.length; x++) {
             int thisMaxMMRDocId = -1;
-            float thisMaxMMRScore = Float.MIN_VALUE;
-            for (SearchHit thisHit : docs) {
+            float thisMaxMMRScore = Float.NEGATIVE_INFINITY;
+            for (SearchHit thisHit : searchHits) {
                 int docId = thisHit.docId();
 
-                if (rerankedDocIds.contains(docId)) {
+                if (selectedDocIds.contains(docId)) {
                     continue;
                 }
 
                 var thisDocVector = fieldVectors.get(docId);
-
                 var cachedScoresForDoc = cachedSimilarities.getOrDefault(docId, new HashMap<>());
 
-                // compute MMR scores for remaining docs
+                // compute MMR scores for the remaining (not yet selected) search hits
                 float highestMMRScore = getHighestScoreForSelectedVectors(
-                    selectedVectors,
+                    fieldVectors,
                     similarityFunction,
                     useFloat,
                     thisDocVector,
@@ -108,15 +104,16 @@ public SearchHits diversify(SearchHits hits, ResultDiversificationContext divers
                 cachedSimilarities.put(docId, cachedScoresForDoc);
             }
 
-            rerankedDocIds.add(thisMaxMMRDocId);
-            selectedVectors.put(thisMaxMMRDocId, fieldVectors.get(thisMaxMMRDocId));
+            if (thisMaxMMRDocId >= 0) {
+                selectedDocIds.add(thisMaxMMRDocId);
+            }
         }
 
-        // our return should be only those docs that are selected
-        SearchHit[] ret = new SearchHit[rerankedDocIds.size()];
-        for (int i = 0; i < rerankedDocIds.size(); i++) {
-            int scoredDocIndex = docIdIndexMapping.get(rerankedDocIds.get(i));
-            ret[i] = docs[scoredDocIndex];
+        // return only the search hits that were selected, in selection order
+        SearchHit[] ret = new SearchHit[selectedDocIds.size()];
+        for (int i = 0; i < selectedDocIds.size(); i++) {
+            int scoredDocIndex = docIdIndexMapping.get(selectedDocIds.get(i));
+            ret[i] = searchHits[scoredDocIndex];
         }
 
         return new SearchHits(
@@ -129,13 +126,6 @@ public SearchHits diversify(SearchHits hits, ResultDiversificationContext divers
         );
     }
 
-    @Override
-    public Explanation explain(int topLevelDocId, ResultDiversificationContext diversificationContext, Explanation sourceExplanation)
-        throws IOException {
-        // TODO
-        return null;
-    }
-
     private float getHighestScoreForSelectedVectors(
         Map<Integer, VectorData> selectedVectors,
         VectorSimilarityFunction similarityFunction,
@@ -163,15 +153,15 @@ private float getHighestScoreForSelectedVectors(
     }
 
     protected Map<Integer, Float> getQuerySimilarityForDocs(
-        SearchHit[] docs,
+        SearchHit[] searchHits,
         Map<Integer, VectorData> fieldVectors,
         VectorSimilarityFunction similarityFunction,
         boolean useFloat,
         ResultDiversificationContext context
     ) {
         Map<Integer, Float> querySimilarity = new HashMap<>();
-        for (int i = 0; i < docs.length; i++) {
-            int docId = docs[i].docId();
+        for (SearchHit searchHit : searchHits) {
+            int docId = searchHit.docId();
             VectorData vectorData = fieldVectors.get(docId);
             if (vectorData != null) {
                 float querySimilarityScore = getVectorComparisonScore(similarityFunction, useFloat, vectorData, context.getQueryVector());
diff --git a/server/src/test/java/org/elasticsearch/search/diversification/mmr/MMRResultDiversificationTests.java b/server/src/test/java/org/elasticsearch/search/diversification/mmr/MMRResultDiversificationTests.java
@@ -42,7 +42,7 @@ public void testMMRDiversification() throws IOException {
         var queryVectorData = new VectorData(new float[] { 0.5f, 0.2f, 0.4f, 0.4f });
         var diversificationContext = new MMRResultDiversificationContext(
             "dense_vector_field",
-            0.6f,
+            0.3f,
             3,
             queryVectorData,
             fieldMapper,
@@ -54,8 +54,8 @@ public void testMMRDiversification() throws IOException {
             generateSearchHit(2, 1.8f, 2, new float[] { 0.4f, 0.2f, 0.3f, 0.3f }),
             generateSearchHit(3, 1.6f, 3, new float[] { 0.4f, 0.1f, 0.3f, 0.3f }),
             generateSearchHit(4, 1.0f, 4, new float[] { 0.1f, 0.9f, 0.5f, 0.9f }),
-            generateSearchHit(5, 0.9f, 5, new float[] { 0.1f, 0.9f, 0.5f, 0.8f }),
-            generateSearchHit(6, 0.5f, 6, new float[] { 0.05f, 0.05f, 0.05f, 0.05f }) };
+            generateSearchHit(5, 0.8f, 5, new float[] { 0.1f, 0.9f, 0.5f, 0.9f }),
+            generateSearchHit(6, 0.8f, 6, new float[] { 0.05f, 0.05f, 0.05f, 0.05f }) };
 
         TotalHits totalHits = new TotalHits(6L, TotalHits.Relation.EQUAL_TO);
         SearchHits searchHits = new SearchHits(hits, totalHits, 2.0f);
@@ -66,7 +66,7 @@ public void testMMRDiversification() throws IOException {
 
         assertEquals(3, diversifiedHits.getHits().length);
         assertEquals(1, diversifiedHits.getHits()[0].docId());
-        assertEquals(4, diversifiedHits.getHits()[1].docId());
+        assertEquals(6, diversifiedHits.getHits()[1].docId());
         assertEquals(3, diversifiedHits.getHits()[2].docId());
     }