From cd8201c25f0d50949bc8931eca5e7e8a4f72be0b Mon Sep 17 00:00:00 2001 From: vigyasharma Date: Wed, 11 Jun 2025 17:40:36 -0700 Subject: [PATCH 1/5] doubleValuesSource rescorer; no explain support --- .../search/DoubleValuesSourceRescorer.java | 93 +++++++++++ .../org/apache/lucene/search/ScoreDoc.java | 13 ++ .../TestDoubleValuesSourceRescorer.java | 147 ++++++++++++++++++ 3 files changed, 253 insertions(+) create mode 100644 lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSourceRescorer.java diff --git a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java new file mode 100644 index 000000000000..a7528074c34c --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.util.ArrayUtil; + +public abstract class DoubleValuesSourceRescorer extends Rescorer { + + final DoubleValuesSource valuesSource; + + public DoubleValuesSourceRescorer(DoubleValuesSource valuesSource) { + this.valuesSource = valuesSource; + } + + /** + * Implement this in a subclass to combine the first pass scores with values from the + * DoubleValuesSource + * + * @param firstPassScore Score from firstPassTopDocs + * @param valuePresent true if DoubleValuesSource has a value for the hit from first pass + * @param sourceValue Value returned from DoubleValuesSource + */ + protected abstract float combine(float firstPassScore, boolean valuePresent, double sourceValue); + + @Override + public TopDocs rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int topN) + throws IOException { + DoubleValuesSource source = valuesSource.rewrite(searcher); + ScoreDoc[] hits = firstPassTopDocs.scoreDocs.clone(); + Arrays.sort(hits, (a, b) -> a.doc - b.doc); + + List leaves = searcher.getIndexReader().leaves(); + LeafReaderContext ctx = leaves.getFirst(); + int currLeaf = 0; + int leafEnd = ctx.docBase + ctx.reader().maxDoc(); + + // find leaf holding this doc + for (ScoreDoc hit : hits) { + while (hit.doc >= leafEnd) { + if (currLeaf == leaves.size() - 1) { + throw new IllegalStateException( + "hit docId=" + + hit.doc + + "greater than last searcher leaf maxDoc=" + + leafEnd + + " Ensure firstPassTopDocs were produced by the searcher provided to rescore."); + } + ctx = leaves.get(++currLeaf); + leafEnd = ctx.docBase + ctx.reader().maxDoc(); + } + + int targetDoc = hit.doc - ctx.docBase; + DoubleValues values = source.getValues(ctx, null); + boolean scorePresent = values.advanceExact(targetDoc); + double secondPassScore = scorePresent ? values.doubleValue() : 0.0f; + hit.score = combine(hit.score, scorePresent, secondPassScore); + } + + if (topN < hits.length) { + ArrayUtil.select(hits, 0, hits.length, topN, ScoreDoc.scoreDocComparator); + ScoreDoc[] subset = new ScoreDoc[topN]; + System.arraycopy(hits, 0, subset, 0, topN); + hits = subset; + } + Arrays.sort(hits, ScoreDoc.scoreDocComparator); + + return new TopDocs(firstPassTopDocs.totalHits, hits); + } + + @Override + public Explanation explain(IndexSearcher searcher, Explanation firstPassExplanation, int docID) + throws IOException { + return null; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/ScoreDoc.java b/lucene/core/src/java/org/apache/lucene/search/ScoreDoc.java index a4600c974db0..19328b124a11 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ScoreDoc.java +++ b/lucene/core/src/java/org/apache/lucene/search/ScoreDoc.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.search; +import java.util.Comparator; import org.apache.lucene.index.StoredFields; /** Holds one hit in {@link TopDocs}. */ @@ -51,4 +52,16 @@ public ScoreDoc(int doc, float score, int shardIndex) { public String toString() { return "doc=" + doc + " score=" + score + " shardIndex=" + shardIndex; } + + /** Utility comparator that sorts by score descending, then by docId ascending */ + public static final Comparator scoreDocComparator = + (a, b) -> { + if (a.score > b.score) { + return -1; + } else if (a.score < b.score) { + return 1; + } else { + return a.doc - b.doc; + } + }; } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSourceRescorer.java b/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSourceRescorer.java new file mode 100644 index 000000000000..a63c6738e583 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSourceRescorer.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.util.Arrays; +import java.util.List; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestDoubleValuesSourceRescorer extends LuceneTestCase { + + private static final String ID_FIELD = "id"; + private static final String DOC_VAL_FIELD = "docVal"; + private static final String DOC_VAL_STORED_FIELD = "storedDocVal"; + + private final DoubleValuesSource doubleValuesSource = + DoubleValuesSource.fromIntField(DOC_VAL_FIELD); + + private final DoubleValuesSourceRescorer rescorer = + new DoubleValuesSourceRescorer(doubleValuesSource) { + @Override + protected float combine(float firstPassScore, boolean valuePresent, double sourceValue) { + return valuePresent ? (float) sourceValue : 0f; + } + }; + + private static final List dictionary = + Arrays.asList( + "river", "quick", "brown", "fox", "jumped", "lazy", "fence", "wizard", "of", "a", "an", + "the", "cookie", "golf", "golden", "tennis", "boy", "plays", "likes", "wants"); + + String randomSentence() { + final int length = random().nextInt(3, 10); + StringBuilder sentence = new StringBuilder(); + for (int i = 0; i < length; i++) { + sentence.append(dictionary.get(random().nextInt(dictionary.size() - 1)) + " "); + } + return sentence.toString(); + } + + private void publishDocs(int numDocs, String fieldName, boolean indexDocValues, Directory dir) + throws Exception { + RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig()); + for (int i = 0; i < numDocs; i++) { + Document d = new Document(); + d.add(newStringField(ID_FIELD, Integer.toString(i), Field.Store.YES)); + d.add(newTextField(fieldName, randomSentence(), Field.Store.NO)); + if (indexDocValues) { + int val = i + 100; + d.add(new NumericDocValuesField(DOC_VAL_FIELD, val)); + d.add(newStringField(DOC_VAL_STORED_FIELD, Integer.toString(val), Field.Store.YES)); + } + w.addDocument(d); + } + w.close(); + } + + public void testBasic() throws Exception { + try (Directory dir = newDirectory()) { + publishDocs(random().nextInt(100), "title", true, dir); + try (IndexReader r = DirectoryReader.open(dir)) { + IndexSearcher s = new IndexSearcher(r); + TermQuery query = + new TermQuery( + new Term("title", dictionary.get(random().nextInt(dictionary.size() - 1)))); + TopDocs queryHits = s.search(query, 50); + TopDocs rescoredHits = rescorer.rescore(s, queryHits, 15); + assertTrue(rescoredHits.scoreDocs.length <= 15); + assertEquals(queryHits.totalHits, rescoredHits.totalHits); + for (int i = 1; i < rescoredHits.scoreDocs.length; i++) { + assertTrue(rescoredHits.scoreDocs[i - 1].score > rescoredHits.scoreDocs[i].score); + } + for (ScoreDoc hit : rescoredHits.scoreDocs) { + assertEquals( + s.storedFields().document(hit.doc).get(DOC_VAL_STORED_FIELD), + Integer.toString((int) hit.score)); + } + } + } + } + + public void testSubsetAndIdempotency() throws Exception { + try (Directory dir = newDirectory()) { + publishDocs(random().nextInt(60, 200), "title", true, dir); + try (IndexReader r = DirectoryReader.open(dir)) { + IndexSearcher s = new IndexSearcher(r); + TermQuery query = + new TermQuery( + new Term("title", dictionary.get(random().nextInt(dictionary.size() - 1)))); + TopDocs queryHits = s.search(query, 50); + TopDocs rescoredHits1 = rescorer.rescore(s, queryHits, 15); + + int hits1Len = rescoredHits1.scoreDocs.length; + int hit2N = Math.max(hits1Len / 2, 1); + TopDocs rescoredHits2 = rescorer.rescore(s, queryHits, hit2N); + assertEquals(hit2N, rescoredHits2.scoreDocs.length); + for (int i = 0; i < hit2N; i++) { + assertEquals(rescoredHits1.scoreDocs[i].doc, rescoredHits2.scoreDocs[i].doc); + assertEquals(rescoredHits1.scoreDocs[i].score, rescoredHits2.scoreDocs[i].score, 1e-5); + } + } + } + } + + public void testMissingValues() throws Exception { + try (Directory dir = newDirectory()) { + publishDocs(random().nextInt(60, 200), "title", false, dir); + try (IndexReader r = DirectoryReader.open(dir)) { + IndexSearcher s = new IndexSearcher(r); + TermQuery query = + new TermQuery( + new Term("title", dictionary.get(random().nextInt(dictionary.size() - 1)))); + TopDocs queryHits = s.search(query, 50); + TopDocs rescoredHits = rescorer.rescore(s, queryHits, 15); + assertTrue(rescoredHits.scoreDocs.length <= 15); + assertEquals(queryHits.totalHits, rescoredHits.totalHits); + for (int i = 0; i < rescoredHits.scoreDocs.length; i++) { + assertEquals(rescoredHits.scoreDocs[i].score, 0f, 1e-5); + if (i > 0) { + assertTrue(rescoredHits.scoreDocs[i - 1].doc < rescoredHits.scoreDocs[i].doc); + } + } + } + } + } +} From 56bc5dc113d130860e88d71499dcf8c4f00a7b8f Mon Sep 17 00:00:00 2001 From: vigyasharma Date: Thu, 12 Jun 2025 13:07:41 -0700 Subject: [PATCH 2/5] add explain impl --- .../search/DoubleValuesSourceRescorer.java | 41 ++++++++++++++++++- .../TestDoubleValuesSourceRescorer.java | 14 +++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java index a7528074c34c..4164d68b7543 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java @@ -44,6 +44,7 @@ public DoubleValuesSourceRescorer(DoubleValuesSource valuesSource) { public TopDocs rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int topN) throws IOException { DoubleValuesSource source = valuesSource.rewrite(searcher); + // this will still alter scores, we clone to retain hits ordering in firstPassTopDocs ScoreDoc[] hits = firstPassTopDocs.scoreDocs.clone(); Arrays.sort(hits, (a, b) -> a.doc - b.doc); @@ -88,6 +89,44 @@ public TopDocs rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int top @Override public Explanation explain(IndexSearcher searcher, Explanation firstPassExplanation, int docID) throws IOException { - return null; + Explanation first = + Explanation.match( + firstPassExplanation.getValue(), "first pass score", firstPassExplanation); + + LeafReaderContext leafWithDoc = null; + for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) { + if (docID >= ctx.docBase && docID < (ctx.docBase + ctx.reader().maxDoc())) { + leafWithDoc = ctx; + break; + } + } + if (leafWithDoc == null) { + throw new IllegalArgumentException( + "docId=" + docID + " not found in any leaf in provided searcher"); + } + + DoubleValuesSource source = valuesSource.rewrite(searcher); + Explanation doubleValuesMatch = + source.explain( + leafWithDoc, + docID - leafWithDoc.docBase, + Explanation.noMatch("DoubleValuesSource was not initialized with query scores")); + Explanation second = + doubleValuesMatch.isMatch() + ? Explanation.match( + doubleValuesMatch.getValue(), "value from DoubleValuesSource", doubleValuesMatch) + : Explanation.noMatch("no value in DoubleValuesSource"); + + float score = + combine( + first.getValue().floatValue(), + doubleValuesMatch.isMatch(), + doubleValuesMatch.getValue().doubleValue()); + String desc = + "combined score from firstPass and DoubleValuesSource=" + + source.getClass() + + " using " + + getClass(); + return Explanation.match(score, desc, first, second); } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSourceRescorer.java b/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSourceRescorer.java index a63c6738e583..86b1095e86c2 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSourceRescorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestDoubleValuesSourceRescorer.java @@ -96,6 +96,13 @@ public void testBasic() throws Exception { s.storedFields().document(hit.doc).get(DOC_VAL_STORED_FIELD), Integer.toString((int) hit.score)); } + int doc = rescoredHits.scoreDocs[0].doc; + Explanation e = rescorer.explain(s, s.explain(query, doc), doc); + String msg = e.toString(); + assertTrue(msg.contains("combined score from firstPass and DoubleValuesSource")); + assertTrue(msg.contains(getClass().toString())); + assertTrue(msg.contains("first pass score")); + assertTrue(msg.contains("value from DoubleValuesSource")); } } } @@ -141,6 +148,13 @@ public void testMissingValues() throws Exception { assertTrue(rescoredHits.scoreDocs[i - 1].doc < rescoredHits.scoreDocs[i].doc); } } + int doc = rescoredHits.scoreDocs[0].doc; + Explanation e = rescorer.explain(s, s.explain(query, doc), doc); + String msg = e.toString(); + assertTrue(msg.contains("combined score from firstPass and DoubleValuesSource")); + assertTrue(msg.contains(getClass().toString())); + assertTrue(msg.contains("first pass score")); + assertTrue(msg.contains("no value in DoubleValuesSource")); } } } From 740ef843780ba63f817cbf9688d1d6a4a06f2e4e Mon Sep 17 00:00:00 2001 From: vigyasharma Date: Thu, 12 Jun 2025 13:21:29 -0700 Subject: [PATCH 3/5] doc string --- .../org/apache/lucene/search/DoubleValuesSourceRescorer.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java index 4164d68b7543..03680935d900 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java @@ -22,9 +22,10 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.util.ArrayUtil; +/** A {@link Rescorer} that uses provided DoubleValuesSource to rescore first pass hits. */ public abstract class DoubleValuesSourceRescorer extends Rescorer { - final DoubleValuesSource valuesSource; + private final DoubleValuesSource valuesSource; public DoubleValuesSourceRescorer(DoubleValuesSource valuesSource) { this.valuesSource = valuesSource; From 0ae7ad367f6ad22462dfef9724e1c9a4fd1fd8bc Mon Sep 17 00:00:00 2001 From: vigyasharma Date: Sat, 14 Jun 2025 12:42:25 -0700 Subject: [PATCH 4/5] resolve changes conflict --- lucene/CHANGES.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0b0596dc2815..6f547169ea88 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -92,6 +92,9 @@ New Features * GITHUB#14565: Add ParentsChildrenBlockJoinQuery that supports parent and child filter in the same query along with limiting number of child documents to retrieve per parent. (Jinny Wang) +* GITHUB#14776: Add a Rescorer that uses values from provided DoubleValuesSource to re-score + first pass hits. (Vigya Sharma) + Improvements --------------------- * GITHUB#14458: Add an IndexDeletion policy that retains the last N commits. (Owais Kazi) From 0a42feeca77d9c59efa93c16dba8a124faf87a00 Mon Sep 17 00:00:00 2001 From: vigyasharma Date: Sat, 14 Jun 2025 12:47:22 -0700 Subject: [PATCH 5/5] rename comparator in scoredoc --- .../org/apache/lucene/search/DoubleValuesSourceRescorer.java | 4 ++-- lucene/core/src/java/org/apache/lucene/search/ScoreDoc.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java index 03680935d900..8b1cb3445495 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSourceRescorer.java @@ -77,12 +77,12 @@ public TopDocs rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int top } if (topN < hits.length) { - ArrayUtil.select(hits, 0, hits.length, topN, ScoreDoc.scoreDocComparator); + ArrayUtil.select(hits, 0, hits.length, topN, ScoreDoc.COMPARATOR); ScoreDoc[] subset = new ScoreDoc[topN]; System.arraycopy(hits, 0, subset, 0, topN); hits = subset; } - Arrays.sort(hits, ScoreDoc.scoreDocComparator); + Arrays.sort(hits, ScoreDoc.COMPARATOR); return new TopDocs(firstPassTopDocs.totalHits, hits); } diff --git a/lucene/core/src/java/org/apache/lucene/search/ScoreDoc.java b/lucene/core/src/java/org/apache/lucene/search/ScoreDoc.java index 19328b124a11..2279de4560ed 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ScoreDoc.java +++ b/lucene/core/src/java/org/apache/lucene/search/ScoreDoc.java @@ -54,7 +54,7 @@ public String toString() { } /** Utility comparator that sorts by score descending, then by docId ascending */ - public static final Comparator scoreDocComparator = + public static final Comparator COMPARATOR = (a, b) -> { if (a.score > b.score) { return -1;