Skip to content

Commit 2dcfd89

Browse files
jpountz and gf2121
authored
Use the bulk SimScorer#score API to compute impact scores. (#15151)
In #15039 we introduced a bulk `SimScorer#score` API and used it to compute scores with the leading conjunctive clause and "essential" clauses of disjunctive queries. With this PR, we are now also using this bulk API when translating (term frequency, length normalization factor) pairs into the maximum possible score that a block of postings may produce. To do it right, I had to change the impacts API to no longer return a List of (term freq, norm) pairs, but instead two parallel arrays of term frequencies and norms that could (almost) directly be passed to the `SimScorer#score` bulk API. Unfortunately this makes the change quite big since many backward formats had to be touched. Co-authored-by: Guo Feng <[email protected]>
1 parent e349318 commit 2dcfd89

39 files changed

+589
-563
lines changed

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ Optimizations
131131

132132
* GITHUB#14998: Speed up flushing of live docs. (Adrien Grand)
133133

134+
* GITHUB#15151: Use `SimScorer#score` bulk API to compute impact scores per
135+
block of postings. (Adrien Grand)
136+
134137
Bug Fixes
135138
---------------------
136139
* GITHUB#14161: PointInSetQuery's constructor now throws IllegalArgumentException

lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/Lucene101PostingsReader.java

Lines changed: 37 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,13 @@
2727
import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.VERSION_START;
2828

2929
import java.io.IOException;
30-
import java.util.AbstractList;
3130
import java.util.Arrays;
32-
import java.util.Collections;
33-
import java.util.List;
34-
import java.util.RandomAccess;
3531
import org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState;
3632
import org.apache.lucene.codecs.BlockTermState;
3733
import org.apache.lucene.codecs.CodecUtil;
3834
import org.apache.lucene.codecs.PostingsReaderBase;
3935
import org.apache.lucene.index.FieldInfo;
40-
import org.apache.lucene.index.Impact;
36+
import org.apache.lucene.index.FreqAndNormBuffer;
4137
import org.apache.lucene.index.Impacts;
4238
import org.apache.lucene.index.ImpactsEnum;
4339
import org.apache.lucene.index.IndexFileNames;
@@ -64,17 +60,6 @@
6460
*/
6561
public final class Lucene101PostingsReader extends PostingsReaderBase {
6662

67-
// Dummy impacts, composed of the maximum possible term frequency and the lowest possible
68-
// (unsigned) norm value. This is typically used on tail blocks, which don't actually record
69-
// impacts as the storage overhead would not be worth any query evaluation speedup, since there's
70-
// less than 128 docs left to evaluate anyway.
71-
private static final List<Impact> DUMMY_IMPACTS =
72-
Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
73-
74-
// We stopped storing a placeholder impact with freq=1 for fields with DOCS after 9.12.0
75-
private static final List<Impact> DUMMY_IMPACTS_NO_FREQS =
76-
Collections.singletonList(new Impact(1, 1L));
77-
7863
private final IndexInput docIn;
7964
private final IndexInput posIn;
8065
private final IndexInput payIn;
@@ -402,15 +387,15 @@ private enum DeltaEncoding {
402387
private long level0PayEndFP;
403388
private int level0BlockPayUpto;
404389
private final BytesRef level0SerializedImpacts;
405-
private final MutableImpactList level0Impacts;
406390

407391
// level 1 skip data
408392
private long level1PosEndFP;
409393
private int level1BlockPosUpto;
410394
private long level1PayEndFP;
411395
private int level1BlockPayUpto;
412396
private final BytesRef level1SerializedImpacts;
413-
private final MutableImpactList level1Impacts;
397+
398+
private final FreqAndNormBuffer impactBuffer;
414399

415400
// true if we shallow-advanced to a new block that we have not decoded yet
416401
private boolean needsRefilling;
@@ -439,16 +424,24 @@ public BlockPostingsEnum(FieldInfo fieldInfo, int flags, boolean needsImpacts)
439424
Arrays.fill(freqBuffer, 1);
440425
}
441426

427+
if (needsImpacts) {
428+
impactBuffer = new FreqAndNormBuffer();
429+
int capacity = 1; // for dummy impacts
430+
if (needsFreq) {
431+
capacity = Math.max(maxNumImpactsAtLevel0, capacity);
432+
capacity = Math.max(maxNumImpactsAtLevel1, capacity);
433+
}
434+
impactBuffer.growNoCopy(capacity);
435+
} else {
436+
impactBuffer = null;
437+
}
438+
442439
if (needsFreq && needsImpacts) {
443440
level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0);
444441
level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1);
445-
level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0);
446-
level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1);
447442
} else {
448443
level0SerializedImpacts = null;
449444
level1SerializedImpacts = null;
450-
level0Impacts = null;
451-
level1Impacts = null;
452445
}
453446

454447
if (needsPos) {
@@ -1318,24 +1311,32 @@ public int getDocIdUpTo(int level) {
13181311
}
13191312

13201313
@Override
1321-
public List<Impact> getImpacts(int level) {
1314+
public FreqAndNormBuffer getImpacts(int level) {
13221315
if (indexHasFreq == false) {
1323-
return DUMMY_IMPACTS_NO_FREQS;
1316+
// Max freq is 1 since freqs are not indexed
1317+
impactBuffer.size = 1;
1318+
impactBuffer.freqs[0] = 1;
1319+
impactBuffer.norms[0] = 1L;
1320+
return impactBuffer;
13241321
}
13251322
if (level == 0 && level0LastDocID != NO_MORE_DOCS) {
1326-
return readImpacts(level0SerializedImpacts, level0Impacts);
1323+
return readImpacts(level0SerializedImpacts, impactBuffer);
13271324
}
13281325
if (level == 1) {
1329-
return readImpacts(level1SerializedImpacts, level1Impacts);
1326+
return readImpacts(level1SerializedImpacts, impactBuffer);
13301327
}
1331-
return DUMMY_IMPACTS;
1328+
impactBuffer.size = 1;
1329+
impactBuffer.freqs[0] = Integer.MAX_VALUE;
1330+
impactBuffer.norms[0] = 1L;
1331+
return impactBuffer;
13321332
}
13331333

1334-
private List<Impact> readImpacts(BytesRef serialized, MutableImpactList impactsList) {
1334+
private FreqAndNormBuffer readImpacts(
1335+
BytesRef serialized, FreqAndNormBuffer impactBuffer) {
13351336
var scratch = this.scratch;
13361337
scratch.reset(serialized.bytes, 0, serialized.length);
1337-
Lucene101PostingsReader.readImpacts(scratch, impactsList);
1338-
return impactsList;
1338+
Lucene101PostingsReader.readImpacts(scratch, impactBuffer);
1339+
return impactBuffer;
13391340
}
13401341
};
13411342

@@ -1379,32 +1380,10 @@ private static void prefetchPostings(IndexInput docIn, IntBlockTermState state)
13791380
// Note: we don't prefetch positions or offsets, which are less likely to be needed.
13801381
}
13811382

1382-
static class MutableImpactList extends AbstractList<Impact> implements RandomAccess {
1383-
int length;
1384-
final Impact[] impacts;
1385-
1386-
MutableImpactList(int capacity) {
1387-
impacts = new Impact[capacity];
1388-
for (int i = 0; i < capacity; ++i) {
1389-
impacts[i] = new Impact(Integer.MAX_VALUE, 1L);
1390-
}
1391-
}
1392-
1393-
@Override
1394-
public Impact get(int index) {
1395-
return impacts[index];
1396-
}
1397-
1398-
@Override
1399-
public int size() {
1400-
return length;
1401-
}
1402-
}
1403-
1404-
static MutableImpactList readImpacts(ByteArrayDataInput in, MutableImpactList reuse) {
1383+
static FreqAndNormBuffer readImpacts(ByteArrayDataInput in, FreqAndNormBuffer reuse) {
14051384
int freq = 0;
14061385
long norm = 0;
1407-
int length = 0;
1386+
int size = 0;
14081387
while (in.getPosition() < in.length()) {
14091388
int freqDelta = in.readVInt();
14101389
if ((freqDelta & 0x01) != 0) {
@@ -1418,12 +1397,11 @@ static MutableImpactList readImpacts(ByteArrayDataInput in, MutableImpactList re
14181397
freq += 1 + (freqDelta >>> 1);
14191398
norm++;
14201399
}
1421-
Impact impact = reuse.impacts[length];
1422-
impact.freq = freq;
1423-
impact.norm = norm;
1424-
length++;
1400+
reuse.freqs[size] = freq;
1401+
reuse.norms[size] = norm;
1402+
size++;
14251403
}
1426-
reuse.length = length;
1404+
reuse.size = size;
14271405
return reuse;
14281406
}
14291407

lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50ScoreSkipReader.java

Lines changed: 16 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,8 @@
1717
package org.apache.lucene.backward_codecs.lucene50;
1818

1919
import java.io.IOException;
20-
import java.util.AbstractList;
2120
import java.util.Arrays;
22-
import java.util.List;
23-
import java.util.RandomAccess;
24-
import org.apache.lucene.index.Impact;
21+
import org.apache.lucene.index.FreqAndNormBuffer;
2522
import org.apache.lucene.index.Impacts;
2623
import org.apache.lucene.store.ByteArrayDataInput;
2724
import org.apache.lucene.store.IndexInput;
@@ -34,7 +31,7 @@ final class Lucene50ScoreSkipReader extends Lucene50SkipReader {
3431
private final ByteArrayDataInput badi = new ByteArrayDataInput();
3532
private final Impacts impacts;
3633
private int numLevels = 1;
37-
private final MutableImpactList[] perLevelImpacts;
34+
private final FreqAndNormBuffer[] perLevelImpacts;
3835

3936
public Lucene50ScoreSkipReader(
4037
int version,
@@ -50,9 +47,10 @@ public Lucene50ScoreSkipReader(
5047
this.impactData = new byte[maxSkipLevels][];
5148
Arrays.fill(impactData, new byte[0]);
5249
this.impactDataLength = new int[maxSkipLevels];
53-
this.perLevelImpacts = new MutableImpactList[maxSkipLevels];
50+
this.perLevelImpacts = new FreqAndNormBuffer[maxSkipLevels];
5451
for (int i = 0; i < perLevelImpacts.length; ++i) {
55-
perLevelImpacts[i] = new MutableImpactList();
52+
perLevelImpacts[i] = new FreqAndNormBuffer();
53+
perLevelImpacts[i].add(Integer.MAX_VALUE, 1L);
5654
}
5755
impacts =
5856
new Impacts() {
@@ -68,7 +66,7 @@ public int getDocIdUpTo(int level) {
6866
}
6967

7068
@Override
71-
public List<Impact> getImpacts(int level) {
69+
public FreqAndNormBuffer getImpacts(int level) {
7270
assert level < numLevels;
7371
if (impactDataLength[level] > 0) {
7472
badi.reset(impactData[level], 0, impactDataLength[level]);
@@ -89,9 +87,9 @@ public int skipTo(int target) throws IOException {
8987
// End of postings don't have skip data anymore, so we fill with dummy data
9088
// like SlowImpactsEnum.
9189
numLevels = 1;
92-
perLevelImpacts[0].length = 1;
93-
perLevelImpacts[0].impacts[0].freq = Integer.MAX_VALUE;
94-
perLevelImpacts[0].impacts[0].norm = 1L;
90+
perLevelImpacts[0].size = 1;
91+
perLevelImpacts[0].freqs[0] = Integer.MAX_VALUE;
92+
perLevelImpacts[0].norms[0] = 1L;
9593
impactDataLength[0] = 0;
9694
}
9795
return result;
@@ -111,19 +109,13 @@ protected void readImpacts(int level, IndexInput skipStream) throws IOException
111109
impactDataLength[level] = length;
112110
}
113111

114-
static MutableImpactList readImpacts(ByteArrayDataInput in, MutableImpactList reuse) {
112+
static FreqAndNormBuffer readImpacts(ByteArrayDataInput in, FreqAndNormBuffer reuse) {
115113
int maxNumImpacts = in.length(); // at most one impact per byte
116-
if (reuse.impacts.length < maxNumImpacts) {
117-
int oldLength = reuse.impacts.length;
118-
reuse.impacts = ArrayUtil.grow(reuse.impacts, maxNumImpacts);
119-
for (int i = oldLength; i < reuse.impacts.length; ++i) {
120-
reuse.impacts[i] = new Impact(Integer.MAX_VALUE, 1L);
121-
}
122-
}
114+
reuse.growNoCopy(maxNumImpacts);
123115

124116
int freq = 0;
125117
long norm = 0;
126-
int length = 0;
118+
int size = 0;
127119
while (in.getPosition() < in.length()) {
128120
int freqDelta = in.readVInt();
129121
if ((freqDelta & 0x01) != 0) {
@@ -137,27 +129,11 @@ static MutableImpactList readImpacts(ByteArrayDataInput in, MutableImpactList re
137129
freq += 1 + (freqDelta >>> 1);
138130
norm++;
139131
}
140-
Impact impact = reuse.impacts[length];
141-
impact.freq = freq;
142-
impact.norm = norm;
143-
length++;
132+
reuse.freqs[size] = freq;
133+
reuse.norms[size] = norm;
134+
size++;
144135
}
145-
reuse.length = length;
136+
reuse.size = size;
146137
return reuse;
147138
}
148-
149-
static class MutableImpactList extends AbstractList<Impact> implements RandomAccess {
150-
int length = 1;
151-
Impact[] impacts = new Impact[] {new Impact(Integer.MAX_VALUE, 1L)};
152-
153-
@Override
154-
public Impact get(int index) {
155-
return impacts[index];
156-
}
157-
158-
@Override
159-
public int size() {
160-
return length;
161-
}
162-
}
163139
}

0 commit comments

Comments
 (0)