Skip to content

Commit 02f5dc3

Browse files
committed
Add a rescorer that uses DoubleValuesSource values to re-score first pass hits (#14776)
1 parent 5705169 commit 02f5dc3

File tree

4 files changed

+310
-0
lines changed

4 files changed

+310
-0
lines changed

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ New Features
2020
* GITHUB#14404: Introducing DocValuesMultiRangeQuery.SortedNumericStabbingBuilder into sandbox.
2121
(Mikhail Khludnev)
2222

23+
* GITHUB#14776: Add a Rescorer that uses values from provided DoubleValuesSource to re-score
24+
first pass hits. (Vigya Sharma)
25+
2326
Improvements
2427
---------------------
2528
* GITHUB#14458: Add an IndexDeletion policy that retains the last N commits. (Owais Kazi)
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.search;
18+
19+
import java.io.IOException;
20+
import java.util.Arrays;
21+
import java.util.List;
22+
import org.apache.lucene.index.LeafReaderContext;
23+
import org.apache.lucene.util.ArrayUtil;
24+
25+
/** A {@link Rescorer} that uses provided DoubleValuesSource to rescore first pass hits. */
26+
public abstract class DoubleValuesSourceRescorer extends Rescorer {
27+
28+
private final DoubleValuesSource valuesSource;
29+
30+
public DoubleValuesSourceRescorer(DoubleValuesSource valuesSource) {
31+
this.valuesSource = valuesSource;
32+
}
33+
34+
/**
35+
* Implement this in a subclass to combine the first pass scores with values from the
36+
* DoubleValuesSource
37+
*
38+
* @param firstPassScore Score from firstPassTopDocs
39+
* @param valuePresent true if DoubleValuesSource has a value for the hit from first pass
40+
* @param sourceValue Value returned from DoubleValuesSource
41+
*/
42+
protected abstract float combine(float firstPassScore, boolean valuePresent, double sourceValue);
43+
44+
@Override
45+
public TopDocs rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int topN)
46+
throws IOException {
47+
DoubleValuesSource source = valuesSource.rewrite(searcher);
48+
// this will still alter scores, we clone to retain hits ordering in firstPassTopDocs
49+
ScoreDoc[] hits = firstPassTopDocs.scoreDocs.clone();
50+
Arrays.sort(hits, (a, b) -> a.doc - b.doc);
51+
52+
List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
53+
LeafReaderContext ctx = leaves.getFirst();
54+
int currLeaf = 0;
55+
int leafEnd = ctx.docBase + ctx.reader().maxDoc();
56+
57+
// find leaf holding this doc
58+
for (ScoreDoc hit : hits) {
59+
while (hit.doc >= leafEnd) {
60+
if (currLeaf == leaves.size() - 1) {
61+
throw new IllegalStateException(
62+
"hit docId="
63+
+ hit.doc
64+
+ "greater than last searcher leaf maxDoc="
65+
+ leafEnd
66+
+ " Ensure firstPassTopDocs were produced by the searcher provided to rescore.");
67+
}
68+
ctx = leaves.get(++currLeaf);
69+
leafEnd = ctx.docBase + ctx.reader().maxDoc();
70+
}
71+
72+
int targetDoc = hit.doc - ctx.docBase;
73+
DoubleValues values = source.getValues(ctx, null);
74+
boolean scorePresent = values.advanceExact(targetDoc);
75+
double secondPassScore = scorePresent ? values.doubleValue() : 0.0f;
76+
hit.score = combine(hit.score, scorePresent, secondPassScore);
77+
}
78+
79+
if (topN < hits.length) {
80+
ArrayUtil.select(hits, 0, hits.length, topN, ScoreDoc.COMPARATOR);
81+
ScoreDoc[] subset = new ScoreDoc[topN];
82+
System.arraycopy(hits, 0, subset, 0, topN);
83+
hits = subset;
84+
}
85+
Arrays.sort(hits, ScoreDoc.COMPARATOR);
86+
87+
return new TopDocs(firstPassTopDocs.totalHits, hits);
88+
}
89+
90+
@Override
91+
public Explanation explain(IndexSearcher searcher, Explanation firstPassExplanation, int docID)
92+
throws IOException {
93+
Explanation first =
94+
Explanation.match(
95+
firstPassExplanation.getValue(), "first pass score", firstPassExplanation);
96+
97+
LeafReaderContext leafWithDoc = null;
98+
for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
99+
if (docID >= ctx.docBase && docID < (ctx.docBase + ctx.reader().maxDoc())) {
100+
leafWithDoc = ctx;
101+
break;
102+
}
103+
}
104+
if (leafWithDoc == null) {
105+
throw new IllegalArgumentException(
106+
"docId=" + docID + " not found in any leaf in provided searcher");
107+
}
108+
109+
DoubleValuesSource source = valuesSource.rewrite(searcher);
110+
Explanation doubleValuesMatch =
111+
source.explain(
112+
leafWithDoc,
113+
docID - leafWithDoc.docBase,
114+
Explanation.noMatch("DoubleValuesSource was not initialized with query scores"));
115+
Explanation second =
116+
doubleValuesMatch.isMatch()
117+
? Explanation.match(
118+
doubleValuesMatch.getValue(), "value from DoubleValuesSource", doubleValuesMatch)
119+
: Explanation.noMatch("no value in DoubleValuesSource");
120+
121+
float score =
122+
combine(
123+
first.getValue().floatValue(),
124+
doubleValuesMatch.isMatch(),
125+
doubleValuesMatch.getValue().doubleValue());
126+
String desc =
127+
"combined score from firstPass and DoubleValuesSource="
128+
+ source.getClass()
129+
+ " using "
130+
+ getClass();
131+
return Explanation.match(score, desc, first, second);
132+
}
133+
}

lucene/core/src/java/org/apache/lucene/search/ScoreDoc.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
*/
1717
package org.apache.lucene.search;
1818

19+
import java.util.Comparator;
1920
import org.apache.lucene.index.StoredFields;
2021

2122
/** Holds one hit in {@link TopDocs}. */
@@ -51,4 +52,16 @@ public ScoreDoc(int doc, float score, int shardIndex) {
5152
public String toString() {
5253
return "doc=" + doc + " score=" + score + " shardIndex=" + shardIndex;
5354
}
55+
56+
/** Utility comparator that sorts by score descending, then by docId ascending */
57+
public static final Comparator<ScoreDoc> COMPARATOR =
58+
(a, b) -> {
59+
if (a.score > b.score) {
60+
return -1;
61+
} else if (a.score < b.score) {
62+
return 1;
63+
} else {
64+
return a.doc - b.doc;
65+
}
66+
};
5467
}
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.search;
18+
19+
import java.util.Arrays;
20+
import java.util.List;
21+
import org.apache.lucene.document.Document;
22+
import org.apache.lucene.document.Field;
23+
import org.apache.lucene.document.NumericDocValuesField;
24+
import org.apache.lucene.index.DirectoryReader;
25+
import org.apache.lucene.index.IndexReader;
26+
import org.apache.lucene.index.Term;
27+
import org.apache.lucene.store.Directory;
28+
import org.apache.lucene.tests.index.RandomIndexWriter;
29+
import org.apache.lucene.tests.util.LuceneTestCase;
30+
31+
public class TestDoubleValuesSourceRescorer extends LuceneTestCase {
32+
33+
private static final String ID_FIELD = "id";
34+
private static final String DOC_VAL_FIELD = "docVal";
35+
private static final String DOC_VAL_STORED_FIELD = "storedDocVal";
36+
37+
private final DoubleValuesSource doubleValuesSource =
38+
DoubleValuesSource.fromIntField(DOC_VAL_FIELD);
39+
40+
private final DoubleValuesSourceRescorer rescorer =
41+
new DoubleValuesSourceRescorer(doubleValuesSource) {
42+
@Override
43+
protected float combine(float firstPassScore, boolean valuePresent, double sourceValue) {
44+
return valuePresent ? (float) sourceValue : 0f;
45+
}
46+
};
47+
48+
private static final List<String> dictionary =
49+
Arrays.asList(
50+
"river", "quick", "brown", "fox", "jumped", "lazy", "fence", "wizard", "of", "a", "an",
51+
"the", "cookie", "golf", "golden", "tennis", "boy", "plays", "likes", "wants");
52+
53+
String randomSentence() {
54+
final int length = random().nextInt(3, 10);
55+
StringBuilder sentence = new StringBuilder();
56+
for (int i = 0; i < length; i++) {
57+
sentence.append(dictionary.get(random().nextInt(dictionary.size() - 1)) + " ");
58+
}
59+
return sentence.toString();
60+
}
61+
62+
private void publishDocs(int numDocs, String fieldName, boolean indexDocValues, Directory dir)
63+
throws Exception {
64+
RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
65+
for (int i = 0; i < numDocs; i++) {
66+
Document d = new Document();
67+
d.add(newStringField(ID_FIELD, Integer.toString(i), Field.Store.YES));
68+
d.add(newTextField(fieldName, randomSentence(), Field.Store.NO));
69+
if (indexDocValues) {
70+
int val = i + 100;
71+
d.add(new NumericDocValuesField(DOC_VAL_FIELD, val));
72+
d.add(newStringField(DOC_VAL_STORED_FIELD, Integer.toString(val), Field.Store.YES));
73+
}
74+
w.addDocument(d);
75+
}
76+
w.close();
77+
}
78+
79+
public void testBasic() throws Exception {
80+
try (Directory dir = newDirectory()) {
81+
publishDocs(random().nextInt(100), "title", true, dir);
82+
try (IndexReader r = DirectoryReader.open(dir)) {
83+
IndexSearcher s = new IndexSearcher(r);
84+
TermQuery query =
85+
new TermQuery(
86+
new Term("title", dictionary.get(random().nextInt(dictionary.size() - 1))));
87+
TopDocs queryHits = s.search(query, 50);
88+
TopDocs rescoredHits = rescorer.rescore(s, queryHits, 15);
89+
assertTrue(rescoredHits.scoreDocs.length <= 15);
90+
assertEquals(queryHits.totalHits, rescoredHits.totalHits);
91+
for (int i = 1; i < rescoredHits.scoreDocs.length; i++) {
92+
assertTrue(rescoredHits.scoreDocs[i - 1].score > rescoredHits.scoreDocs[i].score);
93+
}
94+
for (ScoreDoc hit : rescoredHits.scoreDocs) {
95+
assertEquals(
96+
s.storedFields().document(hit.doc).get(DOC_VAL_STORED_FIELD),
97+
Integer.toString((int) hit.score));
98+
}
99+
int doc = rescoredHits.scoreDocs[0].doc;
100+
Explanation e = rescorer.explain(s, s.explain(query, doc), doc);
101+
String msg = e.toString();
102+
assertTrue(msg.contains("combined score from firstPass and DoubleValuesSource"));
103+
assertTrue(msg.contains(getClass().toString()));
104+
assertTrue(msg.contains("first pass score"));
105+
assertTrue(msg.contains("value from DoubleValuesSource"));
106+
}
107+
}
108+
}
109+
110+
public void testSubsetAndIdempotency() throws Exception {
111+
try (Directory dir = newDirectory()) {
112+
publishDocs(random().nextInt(60, 200), "title", true, dir);
113+
try (IndexReader r = DirectoryReader.open(dir)) {
114+
IndexSearcher s = new IndexSearcher(r);
115+
TermQuery query =
116+
new TermQuery(
117+
new Term("title", dictionary.get(random().nextInt(dictionary.size() - 1))));
118+
TopDocs queryHits = s.search(query, 50);
119+
TopDocs rescoredHits1 = rescorer.rescore(s, queryHits, 15);
120+
121+
int hits1Len = rescoredHits1.scoreDocs.length;
122+
int hit2N = Math.max(hits1Len / 2, 1);
123+
TopDocs rescoredHits2 = rescorer.rescore(s, queryHits, hit2N);
124+
assertEquals(hit2N, rescoredHits2.scoreDocs.length);
125+
for (int i = 0; i < hit2N; i++) {
126+
assertEquals(rescoredHits1.scoreDocs[i].doc, rescoredHits2.scoreDocs[i].doc);
127+
assertEquals(rescoredHits1.scoreDocs[i].score, rescoredHits2.scoreDocs[i].score, 1e-5);
128+
}
129+
}
130+
}
131+
}
132+
133+
public void testMissingValues() throws Exception {
134+
try (Directory dir = newDirectory()) {
135+
publishDocs(random().nextInt(60, 200), "title", false, dir);
136+
try (IndexReader r = DirectoryReader.open(dir)) {
137+
IndexSearcher s = new IndexSearcher(r);
138+
TermQuery query =
139+
new TermQuery(
140+
new Term("title", dictionary.get(random().nextInt(dictionary.size() - 1))));
141+
TopDocs queryHits = s.search(query, 50);
142+
TopDocs rescoredHits = rescorer.rescore(s, queryHits, 15);
143+
assertTrue(rescoredHits.scoreDocs.length <= 15);
144+
assertEquals(queryHits.totalHits, rescoredHits.totalHits);
145+
for (int i = 0; i < rescoredHits.scoreDocs.length; i++) {
146+
assertEquals(rescoredHits.scoreDocs[i].score, 0f, 1e-5);
147+
if (i > 0) {
148+
assertTrue(rescoredHits.scoreDocs[i - 1].doc < rescoredHits.scoreDocs[i].doc);
149+
}
150+
}
151+
int doc = rescoredHits.scoreDocs[0].doc;
152+
Explanation e = rescorer.explain(s, s.explain(query, doc), doc);
153+
String msg = e.toString();
154+
assertTrue(msg.contains("combined score from firstPass and DoubleValuesSource"));
155+
assertTrue(msg.contains(getClass().toString()));
156+
assertTrue(msg.contains("first pass score"));
157+
assertTrue(msg.contains("no value in DoubleValuesSource"));
158+
}
159+
}
160+
}
161+
}

0 commit comments

Comments
 (0)