-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Move HitQueue in TopScoreDocCollector to a LongHeap #14714
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 22 commits
7f0ed2a
487bbc8
6ad87d1
b55336a
ab6773d
b53ef5f
8ec9930
8b25eb3
1af4d1b
6c7c2eb
ac598df
212d73d
505e0ab
8d129ce
dea53e8
de56623
d1ac4b1
270f63e
5f94725
b729ea7
776e6e8
e650ac4
2ed31cd
4b4878b
8f1abc6
784058d
a1d7699
ee1b72d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.lucene.search; | ||
|
|
||
| import org.apache.lucene.util.NumericUtils; | ||
|
|
||
| /** | ||
| * An encoder to encode a (doc, score) pair as a long whose sort order is the same as {@code (o1, o2) -> | ||
| * Float.compare(o1.score, o2.score)).thenComparing(Comparator.comparingInt((ScoreDoc o) -> | ||
| * o.doc).reversed())} | ||
| * | ||
| * <p>Note that negative scores are allowed, but the relationship between two codes encoded from | ||
| * negative scores is undefined. The only guarantee is that codes encoded from negative scores are | ||
| * smaller than codes encoded from non-negative scores. | ||
| */ | ||
| class DocScoreEncoder { | ||
|
|
||
| static final long LEAST_COMPETITIVE_CODE = encode(Integer.MAX_VALUE, Float.NEGATIVE_INFINITY); | ||
| private static final int POS_INF_TO_SORTABLE_INT = scoreToSortableInt(Float.POSITIVE_INFINITY); | ||
|
|
||
| static long encode(int docId, float score) { | ||
| return encodeIntScore(docId, scoreToSortableInt(score)); | ||
| } | ||
|
|
||
| static long encodeIntScore(int docId, int score) { | ||
| return (((long) score) << 32) | (~docId & 0xFFFFFFFFL); | ||
| } | ||
|
|
||
| static float toScore(long value) { | ||
| return sortableIntToScore(toIntScore(value)); | ||
| } | ||
|
|
||
| static int toIntScore(long value) { | ||
| return (int) (value >>> 32); | ||
| } | ||
|
|
||
| static int docId(long value) { | ||
| return (int) ~value; | ||
| } | ||
|
|
||
| static int nextUp(int intScore) { | ||
| assert intScore <= POS_INF_TO_SORTABLE_INT; | ||
| int nextUp = Math.min(POS_INF_TO_SORTABLE_INT, intScore + 1); | ||
| assert nextUp == scoreToSortableInt(Math.nextUp(sortableIntToScore(intScore))); | ||
| return nextUp; | ||
| } | ||
|
|
||
| /** | ||
| * Score is a non-negative float, so we use floatToRawIntBits instead of {@link | ||
| * NumericUtils#floatToSortableInt}. We do not assert score >= 0 here, to allow passing a negative | ||
| * float to indicate a totally non-competitive hit, e.g. {@link #LEAST_COMPETITIVE_CODE}. | ||
| */ | ||
| static int scoreToSortableInt(float score) { | ||
| assert Float.isNaN(score) == false; | ||
| return Float.floatToRawIntBits(score); | ||
| } | ||
|
|
||
| /** | ||
| * @see #scoreToSortableInt(float) | ||
| */ | ||
| static float sortableIntToScore(int scoreBits) { | ||
| float score = Float.intBitsToFloat(scoreBits); | ||
| assert Float.isNaN(score) == false; | ||
| return score; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,7 +17,9 @@ | |
| package org.apache.lucene.search; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.stream.IntStream; | ||
| import org.apache.lucene.index.LeafReaderContext; | ||
| import org.apache.lucene.util.LongHeap; | ||
|
|
||
| /** | ||
| * A {@link Collector} implementation that collects the top-scoring hits, returning them as a {@link | ||
|
|
@@ -32,31 +34,21 @@ | |
| public class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> { | ||
|
|
||
| private final ScoreDoc after; | ||
| private final LongHeap heap; | ||
| final int totalHitsThreshold; | ||
| final MaxScoreAccumulator minScoreAcc; | ||
|
|
||
| // prevents instantiation | ||
| TopScoreDocCollector( | ||
| int numHits, ScoreDoc after, int totalHitsThreshold, MaxScoreAccumulator minScoreAcc) { | ||
| super(new HitQueue(numHits, true)); | ||
| super(null); | ||
| this.heap = new LongHeap(numHits); | ||
| IntStream.range(0, numHits).forEach(_ -> heap.push(DocScoreEncoder.LEAST_COMPETITIVE_CODE)); | ||
|
||
| this.after = after; | ||
| this.totalHitsThreshold = totalHitsThreshold; | ||
| this.minScoreAcc = minScoreAcc; | ||
| } | ||
|
|
||
| @Override | ||
| protected int topDocsSize() { | ||
| // Note: this relies on sentinel values having Integer.MAX_VALUE as a doc ID. | ||
| int[] validTopHitCount = new int[1]; | ||
| pq.forEach( | ||
| scoreDoc -> { | ||
| if (scoreDoc.doc != Integer.MAX_VALUE) { | ||
| validTopHitCount[0]++; | ||
| } | ||
| }); | ||
| return validTopHitCount[0]; | ||
| } | ||
|
|
||
| @Override | ||
| protected TopDocs newTopDocs(ScoreDoc[] results, int start) { | ||
| return results == null | ||
|
|
@@ -73,23 +65,22 @@ public ScoreMode scoreMode() { | |
| public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException { | ||
| final int docBase = context.docBase; | ||
| final ScoreDoc after = this.after; | ||
| final float afterScore; | ||
| final int afterScore; | ||
|
||
| final int afterDoc; | ||
| if (after == null) { | ||
| afterScore = Float.POSITIVE_INFINITY; | ||
| afterScore = Integer.MAX_VALUE; | ||
| afterDoc = DocIdSetIterator.NO_MORE_DOCS; | ||
| } else { | ||
| afterScore = after.score; | ||
| afterScore = DocScoreEncoder.scoreToSortableInt(after.score); | ||
| afterDoc = after.doc - context.docBase; | ||
| } | ||
|
|
||
| return new LeafCollector() { | ||
|
|
||
| private Scorable scorer; | ||
| // HitQueue implements getSentinelObject to return a ScoreDoc, so we know | ||
| // that at this point top() is already initialized. | ||
| private ScoreDoc pqTop = pq.top(); | ||
| private float minCompetitiveScore; | ||
| private long topCode = heap.top(); | ||
| private int topScore = DocScoreEncoder.toIntScore(topCode); | ||
| private int minCompetitiveScore; | ||
|
|
||
| @Override | ||
| public void setScorer(Scorable scorer) throws IOException { | ||
|
|
@@ -103,7 +94,7 @@ public void setScorer(Scorable scorer) throws IOException { | |
|
|
||
| @Override | ||
| public void collect(int doc) throws IOException { | ||
| float score = scorer.score(); | ||
| final int score = DocScoreEncoder.scoreToSortableInt(scorer.score()); | ||
|
|
||
| int hitCountSoFar = ++totalHits; | ||
|
|
||
|
|
@@ -121,7 +112,7 @@ public void collect(int doc) throws IOException { | |
| return; | ||
| } | ||
|
|
||
| if (score <= pqTop.score) { | ||
| if (score <= topScore) { | ||
| // Note: for queries that match lots of hits, this is the common case: most hits are not | ||
| // competitive. | ||
| if (hitCountSoFar == totalHitsThreshold + 1) { | ||
|
|
@@ -138,10 +129,10 @@ public void collect(int doc) throws IOException { | |
| } | ||
| } | ||
|
|
||
| private void collectCompetitiveHit(int doc, float score) throws IOException { | ||
| pqTop.doc = doc + docBase; | ||
| pqTop.score = score; | ||
| pqTop = pq.updateTop(); | ||
| private void collectCompetitiveHit(int doc, int score) throws IOException { | ||
| final long code = DocScoreEncoder.encodeIntScore(doc + docBase, score); | ||
| topCode = heap.updateTop(code); | ||
| topScore = DocScoreEncoder.toIntScore(topCode); | ||
| updateMinCompetitiveScore(scorer); | ||
| } | ||
|
|
||
|
|
@@ -152,10 +143,11 @@ private void updateGlobalMinCompetitiveScore(Scorable scorer) throws IOException | |
| // since we tie-break on doc id and collect in doc id order we can require | ||
| // the next float if the global minimum score is set on a document id that is | ||
| // smaller than the ids in the current leaf | ||
| float score = MaxScoreAccumulator.toScore(maxMinScore); | ||
| score = docBase >= MaxScoreAccumulator.docId(maxMinScore) ? Math.nextUp(score) : score; | ||
| int score = DocScoreEncoder.toIntScore(maxMinScore); | ||
| score = | ||
| docBase >= DocScoreEncoder.docId(maxMinScore) ? DocScoreEncoder.nextUp(score) : score; | ||
| if (score > minCompetitiveScore) { | ||
| scorer.setMinCompetitiveScore(score); | ||
| scorer.setMinCompetitiveScore(DocScoreEncoder.sortableIntToScore(score)); | ||
| minCompetitiveScore = score; | ||
| totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO; | ||
| } | ||
|
|
@@ -164,23 +156,44 @@ private void updateGlobalMinCompetitiveScore(Scorable scorer) throws IOException | |
|
|
||
| private void updateMinCompetitiveScore(Scorable scorer) throws IOException { | ||
| if (totalHits > totalHitsThreshold) { | ||
| // since we tie-break on doc id and collect in doc id order, we can require the next float | ||
| // pqTop is never null since TopScoreDocCollector fills the priority queue with sentinel | ||
| // values if the top element is a sentinel value, its score will be -Infty and the below | ||
| // logic is still valid | ||
| float localMinScore = Math.nextUp(pqTop.score); | ||
| if (localMinScore > minCompetitiveScore) { | ||
| scorer.setMinCompetitiveScore(localMinScore); | ||
| if (topScore >= minCompetitiveScore) { | ||
| minCompetitiveScore = DocScoreEncoder.nextUp(topScore); | ||
| scorer.setMinCompetitiveScore(DocScoreEncoder.sortableIntToScore(minCompetitiveScore)); | ||
| totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO; | ||
| minCompetitiveScore = localMinScore; | ||
| if (minScoreAcc != null) { | ||
| // we don't use the next float but we register the document id so that other leaves or | ||
| // leaf partitions can require it if they are after the current maximum | ||
| minScoreAcc.accumulate(pqTop.doc, pqTop.score); | ||
| minScoreAcc.accumulate(topCode); | ||
| } | ||
| } | ||
| } | ||
| } | ||
| }; | ||
| } | ||
|
|
||
| @Override | ||
| protected int topDocsSize() { | ||
| int cnt = 0; | ||
| for (int i = 1; i <= heap.size(); i++) { | ||
| if (heap.get(i) != DocScoreEncoder.LEAST_COMPETITIVE_CODE) { | ||
| cnt++; | ||
| } | ||
| } | ||
| return cnt; | ||
| } | ||
|
|
||
| @Override | ||
| protected void populateResults(ScoreDoc[] results, int howMany) { | ||
| for (int i = howMany - 1; i >= 0; i--) { | ||
| long encode = heap.pop(); | ||
| results[i] = new ScoreDoc(DocScoreEncoder.docId(encode), DocScoreEncoder.toScore(encode)); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| protected void pruneLeastCompetitiveHitsTo(int keep) { | ||
| for (int i = heap.size() - keep; i > 0; i--) { | ||
| heap.pop(); | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a bit too subtle for my taste; could we either not have to deal with negative scores at all, or use NumericUtils#floatToSortableInt? FWIW, I believe that `LEAST_COMPETITIVE_CODE` could use a score of 0, since Integer.MAX_VALUE is not an allowed doc ID?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The issue with using `encode(Integer.MAX_VALUE, 0f)` is that topScore will be decoded as 0:
lucene/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java
Line 135 in e650ac4
Then a score of 0 will not be competitive, as we are comparing scores only:
lucene/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java
Line 115 in e650ac4
I agree this contract is a bit tricky; maybe we should just use `NumericUtils#floatToSortableInt`.