Skip to content

Commit 6b12055

Browse files
authored
Add bulk-retrieval API to NumericDocValues. (#15149)
Lucene recently got very good performance improvements by introducing APIs that apply to batches of doc IDs at once: `DocIdSetIterator#intoBitSet`, `PostingsEnum#nextPostings`, `Scorer#nextDocsAndScores` and `SimScorer#score`. This helps better amortize the cost of virtual function calls across many doc IDs, and also apply additional optimizations, e.g. it's more efficient to bulk-iterate set bits in a `FixedBitSet` than to iterate them one-by-one via `FixedBitSet#nextSetBit`. This PR introduces bulk retrieval for numeric doc values. It is currently only implemented on norms and used to retrieve norms for doc IDs to score, but I tried to design the API in a way that also works for numeric doc values and is sustainable. Specifically, I'm thinking that optimizing the single-valued and dense case should go a very long way, so I did not try to help users retrieve information about which docs have a value or not. In some cases, this is not even needed. E.g. if you want to compute the sum of the values of a field, returning 0 for docs that don't have a value is good. In cases where knowing which docs have a value is important (such as Lucene's `HistogramCollector`), it is still possible to optimize the case when there are long runs of docs with a value with something like below:
```java
void doSomethingWith(int size, int[] docs, NumericDocValues values) {
  if (size > 0 && values.advanceExact(docs[0]) && values.docIDRunEnd() > docs[size - 1]) {
    long[] longValues = new long[size];
    // every doc in the run has a value, so the default value is never used
    values.longValues(size, docs, longValues, 0L);
    // do something with the `longValues` array
  } else {
    // use #advanceExact / #longValue directly
    for (int i = 0; i < size; i++) {
      if (values.advanceExact(docs[i])) {
        // do something with values#longValue
      } else {
        // handle the case when docs don't have a value
      }
    }
  }
}
```
1 parent 6cbd742 commit 6b12055

File tree

8 files changed

+350
-10
lines changed

8 files changed

+350
-10
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,8 @@ Other
112112

113113
API Changes
114114
---------------------
115-
(No changes)
115+
* GITHUB#15149: Introduce NumericDocValues#longValues to help speed up the
116+
retrieval of many doc values at once. (Adrien Grand)
116117

117118
New Features
118119
---------------------
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.benchmark.jmh;
18+
19+
import java.util.Random;
20+
import java.util.concurrent.TimeUnit;
21+
import org.openjdk.jmh.annotations.Benchmark;
22+
import org.openjdk.jmh.annotations.BenchmarkMode;
23+
import org.openjdk.jmh.annotations.Fork;
24+
import org.openjdk.jmh.annotations.Level;
25+
import org.openjdk.jmh.annotations.Measurement;
26+
import org.openjdk.jmh.annotations.Mode;
27+
import org.openjdk.jmh.annotations.OutputTimeUnit;
28+
import org.openjdk.jmh.annotations.Scope;
29+
import org.openjdk.jmh.annotations.Setup;
30+
import org.openjdk.jmh.annotations.State;
31+
import org.openjdk.jmh.annotations.Warmup;
32+
33+
@BenchmarkMode(Mode.Throughput)
34+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
35+
@State(Scope.Benchmark)
36+
@Warmup(iterations = 5, time = 1)
37+
@Measurement(iterations = 5, time = 1)
38+
@Fork(
39+
value = 1,
40+
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
41+
public class PolymorphismBenchmark {
42+
43+
public abstract static class IntList {
44+
45+
public abstract int size();
46+
47+
public abstract int get(int index);
48+
49+
public final long sum1() {
50+
long sum = 0;
51+
for (int i = 0; i < size(); ++i) {
52+
sum += get(i);
53+
}
54+
return sum;
55+
}
56+
57+
public long sum2() {
58+
long sum = 0;
59+
for (int i = 0; i < size(); ++i) {
60+
sum += get(i);
61+
}
62+
return sum;
63+
}
64+
65+
public abstract long sum3();
66+
}
67+
68+
public static class List1 extends IntList {
69+
70+
@Override
71+
public int size() {
72+
return 128;
73+
}
74+
75+
@Override
76+
public int get(int index) {
77+
return 1;
78+
}
79+
80+
@Override
81+
public long sum2() {
82+
return super.sum2();
83+
}
84+
85+
@Override
86+
public long sum3() {
87+
long sum = 0;
88+
for (int i = 0; i < size(); ++i) {
89+
sum += get(i);
90+
}
91+
return sum;
92+
}
93+
}
94+
95+
public static class List2 extends IntList {
96+
97+
@Override
98+
public int size() {
99+
return 128;
100+
}
101+
102+
@Override
103+
public int get(int index) {
104+
return 2;
105+
}
106+
107+
@Override
108+
public long sum2() {
109+
return super.sum2();
110+
}
111+
112+
@Override
113+
public long sum3() {
114+
long sum = 0;
115+
for (int i = 0; i < size(); ++i) {
116+
sum += get(i);
117+
}
118+
return sum;
119+
}
120+
}
121+
122+
public static class List3 extends IntList {
123+
124+
@Override
125+
public int size() {
126+
return 128;
127+
}
128+
129+
@Override
130+
public int get(int index) {
131+
return 3;
132+
}
133+
134+
@Override
135+
public long sum2() {
136+
return super.sum2();
137+
}
138+
139+
@Override
140+
public long sum3() {
141+
long sum = 0;
142+
for (int i = 0; i < size(); ++i) {
143+
sum += get(i);
144+
}
145+
return sum;
146+
}
147+
}
148+
149+
private IntList[] lists;
150+
151+
@Setup(Level.Trial)
152+
public void setup() throws Exception {
153+
lists = new IntList[100];
154+
Random r = new Random(0);
155+
for (int i = 0; i < lists.length; ++i) {
156+
lists[i] =
157+
switch (r.nextInt(4)) {
158+
case 0 -> new List1();
159+
case 1 -> new List2();
160+
default -> new List3();
161+
};
162+
}
163+
}
164+
165+
@Benchmark
166+
public long defaultImpl() {
167+
long sum = 0;
168+
for (IntList list : lists) {
169+
sum += list.sum1();
170+
}
171+
return sum;
172+
}
173+
174+
@Benchmark
175+
public long delegateToDefaultImpl() {
176+
long sum = 0;
177+
for (IntList list : lists) {
178+
sum += list.sum2();
179+
}
180+
return sum;
181+
}
182+
183+
@Benchmark
184+
public long specializedImpl() {
185+
long sum = 0;
186+
for (IntList list : lists) {
187+
sum += list.sum3();
188+
}
189+
return sum;
190+
}
191+
}

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90NormsProducer.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,14 @@ public long longValue() throws IOException {
394394
public long longValue() throws IOException {
395395
return slice.readByte(doc);
396396
}
397+
398+
@Override
399+
public void longValues(int size, int[] docs, long[] values, long defaultValue)
400+
throws IOException {
401+
// Delegate to help performance: when the super call inlines, calls to
402+
// #advanceExact/#longValue become monomorphic.
403+
super.longValues(size, docs, values, defaultValue);
404+
}
397405
};
398406
case 2:
399407
return new DenseNormsIterator(maxDoc) {
@@ -448,6 +456,14 @@ public long longValue() throws IOException {
448456
public long longValue() throws IOException {
449457
return slice.readByte(disi.index());
450458
}
459+
460+
@Override
461+
public void longValues(int size, int[] docs, long[] values, long defaultValue)
462+
throws IOException {
463+
// Delegate to help performance: when the super call inlines, calls to
464+
// #advanceExact/#longValue become monomorphic.
465+
super.longValues(size, docs, values, defaultValue);
466+
}
451467
};
452468
case 2:
453469
return new SparseNormsIterator(disi) {

lucene/core/src/java/org/apache/lucene/index/CheckIndex.java

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1421,6 +1421,8 @@ public static Status.FieldNormStatus testFieldNorms(
14211421
for (FieldInfo info : reader.getFieldInfos()) {
14221422
if (info.hasNorms()) {
14231423
checkNumericDocValues(info.name, normsReader.getNorms(info), normsReader.getNorms(info));
1424+
checkBulkFetchNumericDocValues(
1425+
info.name, normsReader.getNorms(info), normsReader.getNorms(info), reader.maxDoc());
14241426
++status.totFields;
14251427
}
14261428
}
@@ -3558,7 +3560,7 @@ public static Status.DocValuesStatus testDocValues(
35583560
for (FieldInfo fieldInfo : reader.getFieldInfos()) {
35593561
if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
35603562
status.totalValueFields++;
3561-
checkDocValues(fieldInfo, dvReader, status);
3563+
checkDocValues(fieldInfo, reader.maxDoc(), dvReader, status);
35623564
}
35633565
}
35643566

@@ -4047,8 +4049,44 @@ private static void checkNumericDocValues(
40474049
}
40484050
}
40494051

4052+
private static void checkBulkFetchNumericDocValues(
4053+
String fieldName, NumericDocValues ndv, NumericDocValues ndv2, int maxDoc)
4054+
throws IOException {
4055+
4056+
int[] docs = new int[16];
4057+
long[] values = new long[16];
4058+
4059+
for (int doc = -1; doc < maxDoc; ) {
4060+
int size = 0;
4061+
for (int j = 0; j < docs.length; ++j) {
4062+
doc += 1 + (j & 0x03);
4063+
if (doc >= maxDoc) {
4064+
break;
4065+
}
4066+
docs[size++] = doc;
4067+
}
4068+
4069+
long defaultValue = 42L;
4070+
ndv.longValues(size, docs, values, defaultValue);
4071+
4072+
for (int j = 0; j < size; ++j) {
4073+
long expected;
4074+
if (ndv2.advanceExact(docs[j])) {
4075+
expected = ndv2.longValue();
4076+
} else {
4077+
expected = defaultValue;
4078+
}
4079+
if (values[j] != expected) {
4080+
throw new CheckIndexException(
4081+
"#longValues reports different value: " + values[j] + " != " + expected);
4082+
}
4083+
}
4084+
}
4085+
}
4086+
40504087
private static void checkDocValues(
4051-
FieldInfo fi, DocValuesProducer dvReader, DocValuesStatus status) throws Exception {
4088+
FieldInfo fi, int maxDoc, DocValuesProducer dvReader, DocValuesStatus status)
4089+
throws Exception {
40524090
if (fi.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) {
40534091
status.totalSkippingIndex++;
40544092
checkDocValueSkipper(fi, dvReader.getSkipper(fi));
@@ -4079,6 +4117,8 @@ private static void checkDocValues(
40794117
status.totalNumericFields++;
40804118
checkDVIterator(fi, dvReader::getNumeric);
40814119
checkNumericDocValues(fi.name, dvReader.getNumeric(fi), dvReader.getNumeric(fi));
4120+
checkBulkFetchNumericDocValues(
4121+
fi.name, dvReader.getNumeric(fi), dvReader.getNumeric(fi), maxDoc);
40824122
break;
40834123
case NONE:
40844124
default:

lucene/core/src/java/org/apache/lucene/index/NumericDocValues.java

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
package org.apache.lucene.index;
1919

2020
import java.io.IOException;
21+
import org.apache.lucene.search.BooleanClause.Occur;
22+
import org.apache.lucene.search.FieldExistsQuery;
2123

2224
/** A per-document numeric value. */
2325
public abstract class NumericDocValues extends DocValuesIterator {
@@ -32,4 +34,61 @@ protected NumericDocValues() {}
3234
* @return numeric value
3335
*/
3436
public abstract long longValue() throws IOException;
37+
38+
/**
39+
* Bulk retrieval of numeric doc values. This API helps reduce the performance impact of virtual
40+
* function calls.
41+
*
42+
* <p>This API behaves as if implemented as below, which is the default implementation:
43+
*
44+
* <pre class="prettyprint">
45+
* public void longValues(int size, int[] docs, long[] values, long defaultValue) throws IOException {
46+
* for (int i = 0; i &lt; size; ++i) {
47+
* int doc = docs[i];
48+
* long value;
49+
* if (advanceExact(doc)) {
50+
* value = longValue();
51+
* } else {
52+
* value = defaultValue;
53+
* }
54+
* values[i] = value;
55+
* }
56+
* }
57+
* </pre>
58+
*
59+
* <p><b>NOTE</b>: The {@code docs} array is required to be sorted in ascending order with no
60+
* duplicates.
61+
*
62+
* <p><b>NOTE</b>: This API doesn't allow callers to know which doc IDs have a value or not. If
63+
* you need to exclude documents that don't have a value for this field, then you could apply a
64+
* {@link FieldExistsQuery} as a {@link Occur#FILTER} clause. Another option is to fall back to
65+
* using {@link #advanceExact} and {@link #longValue()} on ranges of doc IDs that may not be
66+
* dense, e.g.
67+
*
68+
* <pre class="prettyprint">
69+
* if (size &gt; 0 &amp;&amp; values.advanceExact(docs[0]) &amp;&amp; values.docIDRunEnd() &gt; docs[size - 1]) {
70+
* // use values#longValues to retrieve values
71+
* } else {
72+
* // some docs may not have a value, use #advanceExact and #longValue
73+
* }
74+
* </pre>
75+
*
76+
* @param size the number of values to retrieve
77+
* @param docs the buffer of doc IDs whose values should be looked up
78+
* @param values the buffer of values to fill
79+
* @param defaultValue the value to put in the buffer when a document doesn't have a value
80+
*/
81+
public void longValues(int size, int[] docs, long[] values, long defaultValue)
82+
throws IOException {
83+
for (int i = 0; i < size; ++i) {
84+
int doc = docs[i];
85+
long value;
86+
if (advanceExact(doc)) {
87+
value = longValue();
88+
} else {
89+
value = defaultValue;
90+
}
91+
values[i] = value;
92+
}
93+
}
3594
}

lucene/core/src/java/org/apache/lucene/search/TermScorer.java

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -160,13 +160,7 @@ public void nextDocsAndScores(int upTo, Bits liveDocs, DocAndFloatFeatureBuffer
160160
}
161161
}
162162
if (norms != null) {
163-
for (int i = 0; i < size; ++i) {
164-
if (norms.advanceExact(buffer.docs[i])) {
165-
normValues[i] = norms.longValue();
166-
} else {
167-
normValues[i] = 1L;
168-
}
169-
}
163+
norms.longValues(size, buffer.docs, normValues, 1L);
170164
}
171165

172166
bulkScorer.score(buffer.size, buffer.features, normValues, buffer.features);

0 commit comments

Comments
 (0)