apache · vigyasharma · Jun 28, 2025 · Nov 14, 2024 · Nov 15, 2024 · Nov 15, 2024
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -25,6 +25,7 @@ API Changes
 New Features
 ---------------------
 * GITHUB#14097: Binary partitioning merge policy over float-valued vector field. (Mike Sokolov)
+* GITHUB#14009: Add a new Query that can rescore another Query based on a generic DoubleValuesSource and trim the results down to the top N. (Anh Dung Bui)
 
 Improvements
 ---------------------
@@ -857,7 +858,7 @@ Improvements
 
 * GITHUB#13285: Early terminate graph searches of AbstractVectorSimilarityQuery to follow timeout set from
   IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh)
-  
+
 * GITHUB#13633: Add ability to read/write knn vector values to a MemoryIndex. (Ben Trent)
 
 * GITHUB#12627: patch HNSW graphs to improve reachability of all nodes from entry points
@@ -1774,7 +1775,7 @@ New Features
   closed while queries are running can no longer crash the JVM. To disable this feature,
   pass the following sysprop on Java command line:
   "-Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false" (Uwe Schindler)
-  
+
 * GITHUB#12252 Add function queries for computing similarity scores between knn vectors. (Elia Porciani, Alessandro Benedetti)
 
 Improvements
@@ -2453,7 +2454,7 @@ New Features
 * LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
   to speed up computing the number of hits when possible. (Lu Xugang, Luca Cavanna, Adrien Grand)
 
-* LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory` 
+* LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory`
   implementation. `Monitor` can be created with a readonly `QueryIndex` in order to
   have readonly `Monitor` instances. (Niko Usai)
 
@@ -2512,7 +2513,7 @@ Optimizations
   term of each block as a dictionary when compressing suffixes of the other 63
   terms of the block. (Adrien Grand)
 
-* LUCENE-10411: Add nearest neighbors vectors support to ExitableDirectoryReader. 
+* LUCENE-10411: Add nearest neighbors vectors support to ExitableDirectoryReader.
   (Zach Chen, Adrien Grand, Julie Tibshirani, Tomoko Uchida)
 
 * LUCENE-10542: FieldSource exists implementations can avoid value retrieval (Kevin Risden)
@@ -2677,7 +2678,7 @@ New Features
   points are indexed.
   (Quentin Pradet, Adrien Grand)
 
-* LUCENE-10263: Added Weight#count to NormsFieldExistsQuery to speed up the query if all 
+* LUCENE-10263: Added Weight#count to NormsFieldExistsQuery to speed up the query if all
   documents have the field.. (Alan Woodward)
 
 * LUCENE-10248: Add SpanishPluralStemFilter, for precise stemming of Spanish plurals.
@@ -2703,14 +2704,14 @@ New Features
 
 * LUCENE-10403: Add ArrayUtil#grow(T[]). (Greg Miller)
 
-* LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss, 
+* LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss,
   Alan Woodward)
-  
+
 * LUCENE-10378: Implement Weight#count for PointRangeQuery to provide a faster way to calculate
   the number of matching range docs when each doc has at-most one point and the points are 1-dimensional.
   (Gautam Worah, Ignacio Vera, Adrien Grand)
 
-* LUCENE-10415: FunctionScoreQuery and IndexOrDocValuesQuery delegate Weight#count. (Ignacio Vera)     
+* LUCENE-10415: FunctionScoreQuery and IndexOrDocValuesQuery delegate Weight#count. (Ignacio Vera)
 
 * LUCENE-10382: Add support for filtering in KnnVectorQuery. This allows for finding the
   nearest k documents that also match a query. (Julie Tibshirani, Joel Bernstein)
@@ -2727,10 +2728,10 @@ Improvements
 
 * LUCENE-10238: Upgrade icu4j dependency to 70.1. (Dawid Weiss)
 
-* LUCENE-9820: Extract BKD tree interface and move intersecting logic to the 
+* LUCENE-9820: Extract BKD tree interface and move intersecting logic to the
   PointValues abstract class. (Ignacio Vera, Adrien Grand)
-  
-* LUCENE-10262: Lift up restrictions for navigating PointValues#PointTree 
+
+* LUCENE-10262: Lift up restrictions for navigating PointValues#PointTree
   added in LUCENE-9820 (Ignacio Vera)
 
 * LUCENE-9538: Detect polygon self-intersections in the Tessellator. (Ignacio Vera)
@@ -2845,8 +2846,8 @@ Bug Fixes
 
 * LUCENE-10407: Containing intervals could sometimes yield incorrect matches when wrapped
   in a disjunction. (Alan Woodward, Dawid Weiss)
-  
-* LUCENE-10405: When using the MemoryIndex, binary and Sorted doc values are stored 
+
+* LUCENE-10405: When using the MemoryIndex, binary and Sorted doc values are stored
    as BytesRef instead of BytesRefHash so they don't have a limit on size. (Ignacio Vera)
 
 * LUCENE-10428: Queries with a misbehaving score function may no longer cause
@@ -2878,7 +2879,7 @@ Other
 
 * LUCENE-10413: Make Ukrainian default stop words list available as a public getter. (Alan Woodward)
 
-* LUCENE-10437: Polygon tessellator throws a more informative error message when the provided polygon 
+* LUCENE-10437: Polygon tessellator throws a more informative error message when the provided polygon
   does not contain enough no-collinear points. (Ignacio Vera)
 
 ======================= Lucene 9.0.0 =======================
@@ -2997,7 +2998,7 @@ API Changes
   only applicable for fields that are indexed with doc values only. (Mayya Sharipova,
   Adrien Grand, Simon Willnauer)
 
-* LUCENE-9047: Directory API is now little endian. (Ignacio Vera, Adrien Grand)  
+* LUCENE-9047: Directory API is now little endian. (Ignacio Vera, Adrien Grand)
 
 * LUCENE-9948: No longer require the user to specify whether-or-not a field is multi-valued in
   LongValueFacetCounts (detect automatically based on what is indexed). (Greg Miller)
@@ -3210,7 +3211,7 @@ Improvements
   (David Smiley)
 
 * LUCENE-10062: Switch taxonomy faceting to use numeric doc values for storing ordinals instead of binary doc values
-  with its own custom encoding. (Greg Miller) 
+  with its own custom encoding. (Greg Miller)
 
 Bug fixes
 ---------------------
@@ -3333,10 +3334,10 @@ Other
 * LUCENE-9822: Add assertion to PFOR exception encoding, documenting the BLOCK_SIZE assumption. (Greg Miller)
 
 * LUCENE-9883: Turn on ecj missingEnumCaseDespiteDefault setting. (Zach Chen)
- 
-* LUCENE-9705: Make new versions of all index formats for the Lucene90 codec and move 
-  the existing ones to the backwards codecs. (Julie Tibshirani, Ignacio Vera)  
-  
+
+* LUCENE-9705: Make new versions of all index formats for the Lucene90 codec and move
+  the existing ones to the backwards codecs. (Julie Tibshirani, Ignacio Vera)
+
 * LUCENE-9907: Remove dependency on PackedInts#getReader() from the current codecs and move the
   method to backwards codec. (Ignacio Vera)
 

diff --git a/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java
@@ -387,7 +387,7 @@ public KnnCollector newCollector(
     }
   }
 
-  protected Query createRewrittenQuery(IndexReader reader, TopDocs topK, int reentryCount) {
+  static Query createRewrittenQuery(IndexReader reader, TopDocs topK, int reentryCount) {
     int len = topK.scoreDocs.length;
     assert len > 0;
     float maxScore = topK.scoreDocs[0].score;

diff --git a/lucene/core/src/java/org/apache/lucene/search/RescoreTopNQuery.java b/lucene/core/src/java/org/apache/lucene/search/RescoreTopNQuery.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.Objects;
+import org.apache.lucene.index.IndexReader;
+
+/**
+ * A {@link Query} that re-scores the documents matched by another {@link Query} with a {@link
+ * DoubleValuesSource} and keeps only the top {@code n} documents by the new score.
+ *
+ * <p>All of the work happens in {@link #rewrite(IndexSearcher)}: the inner query is executed
+ * eagerly there, and the retained documents and their new scores are returned as a rewritten
+ * query.
+ *
+ * @lucene.experimental
+ */
+public class RescoreTopNQuery extends Query {
+
+  private final int n;
+  private final Query query;
+  private final DoubleValuesSource valuesSource;
+
+  /**
+   * Execute the inner Query, re-score it using a customizable {@link DoubleValuesSource} and trim
+   * the result down to the top {@code n} documents.
+   *
+   * @param query the query to execute as initial phase
+   * @param valuesSource the double values source used to re-score
+   * @param n the number of documents to keep
+   * @throws IllegalArgumentException if <code>n</code> is less than 1
+   * @throws NullPointerException if <code>query</code> or <code>valuesSource</code> is null
+   */
+  public RescoreTopNQuery(Query query, DoubleValuesSource valuesSource, int n) {
+    if (n < 1) {
+      throw new IllegalArgumentException("n must be >= 1");
+    }
+    // fail fast: rewrite(), hashCode() and toString() all dereference both fields
+    this.query = Objects.requireNonNull(query, "query");
+    this.valuesSource = Objects.requireNonNull(valuesSource, "valuesSource");
+    this.n = n;
+  }
+
+  /**
+   * Runs the inner query over every leaf of the searcher's reader, re-scores each live match and
+   * keeps the {@code n} best.
+   *
+   * @param indexSearcher the searcher used to rewrite and execute the inner query
+   * @return a query over the retained documents and their new scores, or a {@link
+   *     MatchNoDocsQuery} when nothing matched
+   * @throws IOException if reading the index fails
+   */
+  @Override
+  public Query rewrite(IndexSearcher indexSearcher) throws IOException {
+    DoubleValuesSource rewrittenValueSource = valuesSource.rewrite(indexSearcher);
+    IndexReader reader = indexSearcher.getIndexReader();
+    Query rewritten = indexSearcher.rewrite(query);
+    // only ask the inner query for real scores when the values source consumes them
+    boolean needsScores = rewrittenValueSource.needsScores();
+    ScoreMode scoreMode = needsScores ? ScoreMode.COMPLETE : ScoreMode.COMPLETE_NO_SCORES;
+    Weight weight = indexSearcher.createWeight(rewritten, scoreMode, 1.0f);
+    HitQueue queue = new HitQueue(n, false);
+    for (var leaf : reader.leaves()) {
+      Scorer innerScorer = weight.scorer(leaf);
+      if (innerScorer == null) {
+        continue;
+      }
+      DoubleValues innerScores = needsScores ? DoubleValuesSource.fromScorer(innerScorer) : null;
+      DoubleValues rescores = rewrittenValueSource.getValues(leaf, innerScores);
+      // the scorer is driven directly here, so deleted documents must be skipped by hand;
+      // otherwise they could take slots in the queue from live documents
+      var liveDocs = leaf.reader().getLiveDocs();
+      DocIdSetIterator iterator = innerScorer.iterator();
+      while (iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+        int docId = iterator.docID();
+        if (liveDocs != null && liveDocs.get(docId) == false) {
+          continue;
+        }
+        // documents the values source has no value for are kept with a score of 0
+        float score = rescores.advanceExact(docId) ? (float) rescores.doubleValue() : 0f;
+        queue.insertWithOverflow(new ScoreDoc(leaf.docBase + docId, score));
+      }
+    }
+    int size = queue.size();
+    if (size == 0) {
+      // createRewrittenQuery asserts a non-empty TopDocs and reads scoreDocs[0]
+      return new MatchNoDocsQuery();
+    }
+    // pop() removes the lowest-ranked hit first, so filling from the back puts the best hit at
+    // index 0, which is where createRewrittenQuery takes the max score from
+    ScoreDoc[] scoreDocs = new ScoreDoc[size];
+    for (int i = size - 1; i >= 0; i--) {
+      scoreDocs[i] = queue.pop();
+    }
+    TopDocs topDocs = new TopDocs(new TotalHits(size, TotalHits.Relation.EQUAL_TO), scoreDocs);
+    return KnnFloatVectorQuery.createRewrittenQuery(reader, topDocs, 0);
+  }
+
+  @Override
+  public int hashCode() {
+    int result = valuesSource.hashCode();
+    result = 31 * result + Objects.hash(query, n);
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    RescoreTopNQuery that = (RescoreTopNQuery) o;
+    return Objects.equals(query, that.query)
+        && Objects.equals(valuesSource, that.valuesSource)
+        && n == that.n;
+  }
+
+  @Override
+  public void visit(QueryVisitor visitor) {
+    query.visit(visitor);
+  }
+
+  @Override
+  public String toString(String field) {
+    return getClass().getSimpleName()
+        + ":"
+        + query.toString(field)
+        + ":"
+        + valuesSource.toString()
+        + "["
+        + n
+        + "]";
+  }
+}