Skip to content

Commit b5e79a3

Browse files
authored
Refactor main top-n bulk scorers to evaluate hits in a more term-at-a-time fashion. (#14701)
`MaxScoreBulkScorer` and `BlockMaxConjunctionBulkScorer` currently evaluate hits in a doc-at-a-time (DAAT) fashion, meaning that they look at all their clauses to find the next doc, and so forth until all docs from the window are evaluated. This changes evaluation to run in a more term-at-a-time (TAAT) fashion within scoring windows, meaning that each clause is fully evaluated within the window before moving on to the next clause. Note that this isn't completely new: `BooleanScorer` has been doing this to exhaustively evaluate disjunctive queries, by loading their matches into a bit set, one clause at a time. Also note that this is a bit different from traditional TAAT, as this is scoped to small-ish windows of doc IDs, not the entire doc ID space. This in turn allows these scorers to take advantage of the new `Scorer#nextDocsAndScores` API, and provides a good speedup. A downside is that we may need to perform more memory copying in some cases, and evaluate a few more documents, but the change still looks like a win in general.
1 parent 41abd7a commit b5e79a3

File tree

5 files changed

+233
-192
lines changed

5 files changed

+233
-192
lines changed

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ Optimizations
119119
* GITHUB#14700: Return MatchNoDocsQuery when IndexOrDocValuesQuery::rewrite does not match
120120
(Chris Hegarty)
121121

122+
* GITHUB#14701: Optimize top-n bulk scorers by evaluating scoring windows in a
123+
term-at-a-time fashion instead of doc-at-a-time. (Adrien Grand)
124+
122125
Bug Fixes
123126
---------------------
124127
* GITHUB#14654: ValueSource.fromDoubleValuesSource(dvs).getSortField() would throw errors when

lucene/core/src/java/org/apache/lucene/search/BlockMaxConjunctionBulkScorer.java

Lines changed: 33 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import java.util.List;
2323
import org.apache.lucene.search.Weight.DefaultBulkScorer;
2424
import org.apache.lucene.util.Bits;
25-
import org.apache.lucene.util.MathUtil;
2625

2726
/**
2827
* BulkScorer implementation of {@link BlockMaxConjunctionScorer} that focuses on top-level
@@ -38,11 +37,12 @@ final class BlockMaxConjunctionBulkScorer extends BulkScorer {
3837
private final Scorer[] scorers;
3938
private final Scorable[] scorables;
4039
private final DocIdSetIterator[] iterators;
41-
private final DocIdSetIterator lead1, lead2;
42-
private final Scorable scorer1, scorer2;
40+
private final DocIdSetIterator lead;
4341
private final DocAndScore scorable = new DocAndScore();
4442
private final double[] sumOfOtherClauses;
4543
private final int maxDoc;
44+
private final DocAndScoreBuffer docAndScoreBuffer = new DocAndScoreBuffer();
45+
private final DocAndScoreAccBuffer docAndScoreAccBuffer = new DocAndScoreAccBuffer();
4646

4747
BlockMaxConjunctionBulkScorer(int maxDoc, List<Scorer> scorers) throws IOException {
4848
if (scorers.size() <= 1) {
@@ -54,14 +54,9 @@ final class BlockMaxConjunctionBulkScorer extends BulkScorer {
5454
Arrays.stream(this.scorers).map(ScorerUtil::likelyTermScorer).toArray(Scorable[]::new);
5555
this.iterators =
5656
Arrays.stream(this.scorers).map(Scorer::iterator).toArray(DocIdSetIterator[]::new);
57-
lead1 = ScorerUtil.likelyImpactsEnum(iterators[0]);
58-
lead2 = ScorerUtil.likelyImpactsEnum(iterators[1]);
59-
scorer1 = this.scorables[0];
60-
scorer2 = this.scorables[1];
57+
lead = ScorerUtil.likelyImpactsEnum(iterators[0]);
6158
this.sumOfOtherClauses = new double[this.scorers.length];
62-
for (int i = 0; i < sumOfOtherClauses.length; i++) {
63-
sumOfOtherClauses[i] = Double.POSITIVE_INFINITY;
64-
}
59+
Arrays.fill(sumOfOtherClauses, Double.POSITIVE_INFINITY);
6560
this.maxDoc = maxDoc;
6661
}
6762

@@ -86,7 +81,7 @@ private float computeMaxScore(int windowMin, int windowMax) throws IOException {
8681
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
8782
collector.setScorer(scorable);
8883

89-
int windowMin = Math.max(lead1.docID(), min);
84+
int windowMin = Math.max(lead.docID(), min);
9085
while (windowMin < max) {
9186
// Use impacts of the least costly scorer to compute windows
9287
// NOTE: windowMax is inclusive
@@ -97,7 +92,7 @@ public int score(LeafCollector collector, Bits acceptDocs, int min, int max) thr
9792
maxWindowScore = computeMaxScore(windowMin, windowMax);
9893
}
9994
scoreWindow(collector, acceptDocs, windowMin, windowMax + 1, maxWindowScore);
100-
windowMin = Math.max(lead1.docID(), windowMax + 1);
95+
windowMin = Math.max(lead.docID(), windowMax + 1);
10196
}
10297

10398
return windowMin >= maxDoc ? DocIdSetIterator.NO_MORE_DOCS : windowMin;
@@ -111,111 +106,49 @@ private void scoreWindow(
111106
return;
112107
}
113108

114-
if (lead1.docID() < min) {
115-
lead1.advance(min);
109+
if (lead.docID() < min) {
110+
lead.advance(min);
116111
}
117-
if (lead1.docID() >= max) {
112+
if (lead.docID() >= max) {
118113
return;
119114
}
120115

121-
Scorable scorer1 = this.scorer1;
122-
if (scorers[0].getMaxScore(max - 1) == 0f) {
123-
// Null out scorer1 if it may only produce 0 scores over this window. In practice, this is
124-
// mostly useful because FILTER clauses are pushed as constant-scoring MUST clauses with a
125-
// 0 score to this scorer. Setting it to null instead of using a different impl helps
126-
// reduce polymorphism of calls to Scorable#score and skip the check of whether the leading
127-
// clause produced a high-enough score for the doc to be competitive.
128-
scorer1 = null;
129-
}
116+
for (scorers[0].nextDocsAndScores(max, acceptDocs, docAndScoreBuffer);
117+
docAndScoreBuffer.size > 0;
118+
scorers[0].nextDocsAndScores(max, acceptDocs, docAndScoreBuffer)) {
130119

131-
final double sumOfOtherMaxScoresAt1 = sumOfOtherClauses[1];
120+
docAndScoreAccBuffer.copyFrom(docAndScoreBuffer);
132121

133-
advanceHead:
134-
for (int doc = lead1.docID(); doc < max; ) {
135-
if (acceptDocs != null && acceptDocs.get(doc) == false) {
136-
doc = lead1.nextDoc();
137-
continue;
138-
}
139-
140-
// Compute the score as we find more matching clauses, in order to skip advancing other
141-
// clauses if the total score has no chance of being competitive. This works well because
142-
// computing a score is usually cheaper than decoding a full block of postings and
143-
// frequencies.
144-
final boolean hasMinCompetitiveScore = scorable.minCompetitiveScore > 0;
145-
double currentScore;
146-
if (scorer1 != null && hasMinCompetitiveScore) {
147-
currentScore = scorer1.score();
148-
149-
// This is the same logic as in the below for loop, specialized for the 2nd least costly
150-
// clause. This seems to help the JVM.
151-
152-
// First check if we have a chance of having a match based on max scores
153-
if ((float) MathUtil.sumUpperBound(currentScore + sumOfOtherMaxScoresAt1, scorers.length)
154-
< scorable.minCompetitiveScore) {
155-
doc = lead1.nextDoc();
156-
continue advanceHead;
122+
for (int i = 1; i < scorers.length; ++i) {
123+
if (scorable.minCompetitiveScore > 0) {
124+
ScorerUtil.filterCompetitiveHits(
125+
docAndScoreAccBuffer,
126+
sumOfOtherClauses[i],
127+
scorable.minCompetitiveScore,
128+
scorers.length);
157129
}
158-
} else {
159-
currentScore = 0;
160-
}
161130

162-
// NOTE: lead2 may be on `doc` already if we `continue`d on the previous loop iteration.
163-
if (lead2.docID() < doc) {
164-
int next = lead2.advance(doc);
165-
if (next != doc) {
166-
doc = lead1.advance(next);
167-
continue advanceHead;
168-
}
169-
}
170-
assert lead2.docID() == doc;
171-
if (hasMinCompetitiveScore) {
172-
currentScore += scorer2.score();
131+
ScorerUtil.applyRequiredClause(docAndScoreAccBuffer, iterators[i], scorables[i]);
173132
}
174133

175-
for (int i = 2; i < iterators.length; ++i) {
176-
// First check if we have a chance of having a match based on max scores
177-
if (hasMinCompetitiveScore
178-
&& (float) MathUtil.sumUpperBound(currentScore + sumOfOtherClauses[i], scorers.length)
179-
< scorable.minCompetitiveScore) {
180-
doc = lead1.nextDoc();
181-
continue advanceHead;
182-
}
183-
184-
// NOTE: these iterators may be on `doc` already if we called `continue advanceHead` on the
185-
// previous loop iteration.
186-
if (iterators[i].docID() < doc) {
187-
int next = iterators[i].advance(doc);
188-
if (next != doc) {
189-
doc = lead1.advance(next);
190-
continue advanceHead;
191-
}
192-
}
193-
assert iterators[i].docID() == doc;
194-
if (hasMinCompetitiveScore) {
195-
currentScore += scorables[i].score();
196-
}
197-
}
198-
199-
if (hasMinCompetitiveScore == false) {
200-
for (Scorable scorer : scorables) {
201-
currentScore += scorer.score();
202-
}
203-
}
204-
scorable.score = (float) currentScore;
205-
collector.collect(doc);
206-
// The collect() call may have updated the minimum competitive score.
207-
if (maxWindowScore < scorable.minCompetitiveScore) {
208-
// no more hits are competitive
209-
return;
134+
for (int i = 0; i < docAndScoreAccBuffer.size; ++i) {
135+
scorable.score = (float) docAndScoreAccBuffer.scores[i];
136+
collector.collect(docAndScoreAccBuffer.docs[i]);
210137
}
138+
}
211139

212-
doc = lead1.nextDoc();
140+
int maxOtherDoc = -1;
141+
for (int i = 1; i < iterators.length; ++i) {
142+
maxOtherDoc = Math.max(iterators[i].docID(), maxOtherDoc);
143+
}
144+
if (lead.docID() < maxOtherDoc) {
145+
lead.advance(maxOtherDoc);
213146
}
214147
}
215148

216149
@Override
217150
public long cost() {
218-
return lead1.cost();
151+
return lead.cost();
219152
}
220153

221154
private static class DocAndScore extends Scorable {
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.search;
18+
19+
import org.apache.lucene.util.ArrayUtil;
20+
import org.apache.lucene.util.IntsRef;
21+
22+
/**
23+
* Wrapper around parallel arrays storing doc IDs and their corresponding score accumulators.
24+
*
25+
* @lucene.internal
26+
*/
27+
public final class DocAndScoreAccBuffer {
28+
29+
private static final double[] EMPTY_DOUBLES = new double[0];
30+
31+
/** Doc IDs */
32+
public int[] docs = IntsRef.EMPTY_INTS;
33+
34+
/** Scores */
35+
public double[] scores = EMPTY_DOUBLES;
36+
37+
/** Number of valid entries in the doc ID and score arrays. */
38+
public int size;
39+
40+
/** Sole constructor. */
41+
public DocAndScoreAccBuffer() {}
42+
43+
/**
44+
* Grow both arrays to ensure that they can store at least the given number of entries. Existing
45+
* content may be discarded.
46+
*/
47+
public void growNoCopy(int minSize) {
48+
if (docs.length < minSize) {
49+
docs = ArrayUtil.growNoCopy(docs, minSize);
50+
scores = new double[docs.length];
51+
}
52+
}
53+
54+
/**
55+
* Grow both arrays to ensure that they can store at least the given number of entries. Existing
56+
* content is preserved.
57+
*/
58+
public void grow(int minSize) {
59+
if (docs.length < minSize) {
60+
docs = ArrayUtil.grow(docs, minSize);
61+
scores = ArrayUtil.growExact(scores, docs.length);
62+
}
63+
}
64+
65+
/** Copy content from the given {@link DocAndScoreBuffer}, expanding float scores to doubles. */
66+
public void copyFrom(DocAndScoreBuffer buffer) {
67+
growNoCopy(buffer.size);
68+
System.arraycopy(buffer.docs, 0, docs, 0, buffer.size);
69+
for (int i = 0; i < buffer.size; ++i) {
70+
scores[i] = buffer.scores[i];
71+
}
72+
this.size = buffer.size;
73+
}
74+
}

0 commit comments

Comments
 (0)