Skip to content

Commit bea2f67

Browse files
authored
Use a more coarse-grained competitive iterator for skipper-based numeric sorts (#15632)
Numeric sorts against a field with DocValuesSkippers enabled currently use DocValuesRangeIterator (DVRI) to implement competitive iterators. This has a couple of disadvantages: (1) DVRI cannot efficiently implement docIDRunEnd() or intoBitSet(), meaning that bulk conjunction filtering may end up falling into slower code paths; (2) for field value distributions that are essentially random, DVRI falls back to doc-by-doc value checking, meaning that no skipping happens at all while the checks still add overhead. This commit adds a new SkipBlockRangeIterator that only skips whole blocks where no document will be competitive, avoiding any individual doc-by-doc value checks. Its docIDRunEnd() and intoBitSet() implementations are very fast, which means that bulk conjunction filtering will be efficient. The overheads as a whole are very low, so randomly distributed values are much less adversarial, while queries against indexes where the document order is roughly correlated with the query sort get significant boosts.
1 parent 5bf2835 commit bea2f67

File tree

8 files changed

+360
-194
lines changed

8 files changed

+360
-194
lines changed

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,9 @@ Optimizations
336336

337337
* GITHUB#15585: Added shared prefetch counter to maintain count across clones and slices of MemorySegmentIndexInput. (Shubham Sharma)
338338

339+
* GITHUB#15632: Use a coarser-grained competitive iterator with lower construction costs for
340+
numeric sorts against fields with DocValuesSkippers. (Alan Woodward)
341+
339342
Bug Fixes
340343
---------------------
341344
* GITHUB#14161: PointInSetQuery's constructor now throws IllegalArgumentException

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1895,7 +1895,8 @@ public void advance(int target) throws IOException {
18951895
}
18961896
} else {
18971897
// find next interval
1898-
assert target > maxDocID[0] : "target must be bigger that current interval";
1898+
assert target > maxDocID[0]
1899+
: "target " + target + " must be bigger that current interval " + maxDocID[0];
18991900
while (true) {
19001901
levels = input.readByte();
19011902
assert levels <= SKIP_INDEX_MAX_LEVEL && levels > 0
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.lucene.search;
19+
20+
import java.io.IOException;
21+
import org.apache.lucene.index.DocValuesSkipper;
22+
import org.apache.lucene.util.FixedBitSet;
23+
24+
/**
25+
* A DocIdSetIterator that returns all documents within DocValuesSkipper blocks that have minimum
26+
* and maximum values that fall within a specified range.
27+
*/
28+
public class SkipBlockRangeIterator extends AbstractDocIdSetIterator {
29+
30+
private final DocValuesSkipper skipper;
31+
private final long minValue;
32+
private final long maxValue;
33+
34+
/**
35+
* Creates a new SkipBlockRangeIterator
36+
*
37+
* @param skipper the DocValuesSkipper to use to check block bounds
38+
* @param minValue only return documents that lie within a block with a maximum value greater than
39+
* this
40+
* @param maxValue only return documents that lie within a block with a minimum value less than
41+
* this
42+
*/
43+
public SkipBlockRangeIterator(DocValuesSkipper skipper, long minValue, long maxValue) {
44+
this.skipper = skipper;
45+
this.minValue = minValue;
46+
this.maxValue = maxValue;
47+
}
48+
49+
@Override
50+
public int nextDoc() throws IOException {
51+
return advance(doc + 1);
52+
}
53+
54+
@Override
55+
public int advance(int target) throws IOException {
56+
if (target <= skipper.maxDocID(0)) {
57+
// within current block
58+
if (doc > -1) {
59+
// already positioned, so we've checked bounds and know that we're in a matching block
60+
return doc = target;
61+
}
62+
} else {
63+
// Advance to target
64+
skipper.advance(target);
65+
}
66+
67+
// Find the next matching block (could be the current block)
68+
skipper.advance(minValue, maxValue);
69+
return doc = Math.max(target, skipper.minDocID(0));
70+
}
71+
72+
@Override
73+
public long cost() {
74+
return DocIdSetIterator.NO_MORE_DOCS;
75+
}
76+
77+
@Override
78+
public int docIDRunEnd() throws IOException {
79+
int maxDoc = skipper.maxDocID(0);
80+
int nextLevel = 1;
81+
while (nextLevel < skipper.numLevels()
82+
&& skipper.minValue(nextLevel) < maxValue
83+
&& skipper.maxValue(nextLevel) > minValue) {
84+
maxDoc = skipper.maxDocID(nextLevel);
85+
nextLevel++;
86+
}
87+
return maxDoc + 1;
88+
}
89+
90+
@Override
91+
public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException {
92+
while (doc < upTo) {
93+
int end = Math.min(upTo, docIDRunEnd());
94+
bitSet.set(doc - offset, end - offset);
95+
advance(end);
96+
}
97+
}
98+
}

lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,12 @@
2626
import org.apache.lucene.index.NumericDocValues;
2727
import org.apache.lucene.index.PointValues;
2828
import org.apache.lucene.search.DocIdSetIterator;
29-
import org.apache.lucene.search.DocValuesRangeIterator;
3029
import org.apache.lucene.search.FieldComparator;
3130
import org.apache.lucene.search.LeafFieldComparator;
3231
import org.apache.lucene.search.Pruning;
3332
import org.apache.lucene.search.Scorable;
3433
import org.apache.lucene.search.Scorer;
35-
import org.apache.lucene.search.TwoPhaseIterator;
34+
import org.apache.lucene.search.SkipBlockRangeIterator;
3635
import org.apache.lucene.util.DocIdSetBuilder;
3736
import org.apache.lucene.util.IntsRef;
3837

@@ -502,27 +501,11 @@ private void updateSkipInterval(boolean success) {
502501
private class DVSkipperCompetitiveDISIBuilder extends CompetitiveDISIBuilder {
503502

504503
private final DocValuesSkipper skipper;
505-
private final TwoPhaseIterator innerTwoPhase;
506504

507-
DVSkipperCompetitiveDISIBuilder(DocValuesSkipper skipper, NumericLeafComparator leafComparator)
508-
throws IOException {
505+
DVSkipperCompetitiveDISIBuilder(
506+
DocValuesSkipper skipper, NumericLeafComparator leafComparator) {
509507
super(leafComparator);
510508
this.skipper = skipper;
511-
NumericDocValues docValues =
512-
leafComparator.getNumericDocValues(leafComparator.context, field);
513-
innerTwoPhase =
514-
new TwoPhaseIterator(docValues) {
515-
@Override
516-
public boolean matches() throws IOException {
517-
final long value = docValues.longValue();
518-
return value >= minValueAsLong && value <= maxValueAsLong;
519-
}
520-
521-
@Override
522-
public float matchCost() {
523-
return 2; // 2 comparisons
524-
}
525-
};
526509
postInitializeCompetitiveIterator();
527510
}
528511

@@ -553,9 +536,8 @@ void postInitializeCompetitiveIterator() {
553536

554537
@Override
555538
void doUpdateCompetitiveIterator() {
556-
TwoPhaseIterator twoPhaseIterator =
557-
new DocValuesRangeIterator(innerTwoPhase, skipper, minValueAsLong, maxValueAsLong, false);
558-
competitiveIterator.update(TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator));
539+
competitiveIterator.update(
540+
new SkipBlockRangeIterator(skipper, minValueAsLong, maxValueAsLong));
559541
}
560542
}
561543
}
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.search;
18+
19+
import java.io.IOException;
20+
import org.apache.lucene.index.DocValuesSkipper;
21+
import org.apache.lucene.index.NumericDocValues;
22+
import org.apache.lucene.tests.util.LuceneTestCase;
23+
24+
public abstract class BaseDocValuesSkipperTests extends LuceneTestCase {
25+
26+
/**
27+
* Fake numeric doc values so that: - docs 0-256 all match - docs in 256-512 are all greater than
28+
* queryMax - docs in 512-768 are all less than queryMin - docs in 768-1024 have some docs that
29+
* match the range, others not - docs in 1024-2048 follow a similar pattern as docs in 0-1024
30+
* except that not all docs have a - value
31+
*/
32+
protected static NumericDocValues docValues(long queryMin, long queryMax) {
33+
return new NumericDocValues() {
34+
35+
int doc = -1;
36+
37+
@Override
38+
public boolean advanceExact(int target) throws IOException {
39+
throw new UnsupportedOperationException();
40+
}
41+
42+
@Override
43+
public int docID() {
44+
return doc;
45+
}
46+
47+
@Override
48+
public int nextDoc() throws IOException {
49+
return advance(doc + 1);
50+
}
51+
52+
@Override
53+
public int advance(int target) throws IOException {
54+
if (target < 1024) {
55+
// dense up to 1024
56+
return doc = target;
57+
} else if (doc < 2047) {
58+
// 50% docs have a value up to 2048
59+
return doc = target + (target & 1);
60+
} else {
61+
return doc = DocIdSetIterator.NO_MORE_DOCS;
62+
}
63+
}
64+
65+
@Override
66+
public long longValue() throws IOException {
67+
int d = doc % 1024;
68+
if (d < 128) {
69+
return (queryMin + queryMax) >> 1;
70+
} else if (d < 256) {
71+
return queryMax + 1;
72+
} else if (d < 512) {
73+
return queryMin - 1;
74+
} else {
75+
return switch ((d / 2) % 3) {
76+
case 0 -> queryMin - 1;
77+
case 1 -> queryMax + 1;
78+
case 2 -> (queryMin + queryMax) >> 1;
79+
default -> throw new AssertionError();
80+
};
81+
}
82+
}
83+
84+
@Override
85+
public long cost() {
86+
return 42;
87+
}
88+
};
89+
}
90+
91+
/**
92+
* Fake skipper over a NumericDocValues field built by an equivalent call to {@link
93+
* #docValues(long, long)}
94+
*/
95+
protected static DocValuesSkipper docValuesSkipper(
96+
long queryMin, long queryMax, boolean doLevels) {
97+
return new DocValuesSkipper() {
98+
99+
int doc = -1;
100+
101+
@Override
102+
public void advance(int target) throws IOException {
103+
doc = target;
104+
}
105+
106+
@Override
107+
public int numLevels() {
108+
return doLevels ? 3 : 1;
109+
}
110+
111+
@Override
112+
public int minDocID(int level) {
113+
int rangeLog = 9 - numLevels() + level;
114+
115+
// the level is the log2 of the interval
116+
if (doc < 0) {
117+
return -1;
118+
} else if (doc >= 2048) {
119+
return DocIdSetIterator.NO_MORE_DOCS;
120+
} else {
121+
int mask = (1 << rangeLog) - 1;
122+
// prior multiple of 2^level
123+
return doc & ~mask;
124+
}
125+
}
126+
127+
@Override
128+
public int maxDocID(int level) {
129+
int rangeLog = 9 - numLevels() + level;
130+
131+
int minDocID = minDocID(level);
132+
return switch (minDocID) {
133+
case -1 -> -1;
134+
case DocIdSetIterator.NO_MORE_DOCS -> DocIdSetIterator.NO_MORE_DOCS;
135+
default -> minDocID + (1 << rangeLog) - 1;
136+
};
137+
}
138+
139+
@Override
140+
@SuppressWarnings("DuplicateBranches")
141+
public long minValue(int level) {
142+
int d = doc % 1024;
143+
if (d < 128) {
144+
return queryMin;
145+
} else if (d < 256) {
146+
return queryMax + 1;
147+
} else if (d < 768) {
148+
return queryMin - 1;
149+
} else {
150+
return queryMin - 1;
151+
}
152+
}
153+
154+
@Override
155+
public long maxValue(int level) {
156+
int d = doc % 1024;
157+
if (d < 128) {
158+
return queryMax;
159+
} else if (d < 256) {
160+
return queryMax + 1;
161+
} else if (d < 768) {
162+
return queryMin - 1;
163+
} else {
164+
return queryMax + 1;
165+
}
166+
}
167+
168+
@Override
169+
public int docCount(int level) {
170+
int rangeLog = 9 - numLevels() + level;
171+
172+
if (doc < 1024) {
173+
return 1 << rangeLog;
174+
} else {
175+
// half docs have a value
176+
return 1 << rangeLog >> 1;
177+
}
178+
}
179+
180+
@Override
181+
public long minValue() {
182+
return Long.MIN_VALUE;
183+
}
184+
185+
@Override
186+
public long maxValue() {
187+
return Long.MAX_VALUE;
188+
}
189+
190+
@Override
191+
public int docCount() {
192+
return 1024 + 1024 / 2;
193+
}
194+
};
195+
}
196+
}

0 commit comments

Comments
 (0)