Skip to content

Commit 41e2993

Browse files
committed
Backport: Move group-varint encoding/decoding logic to DataOutput/DataInput (#12841)
1 parent 86573e5 commit 41e2993

File tree

16 files changed

+371
-180
lines changed

16 files changed

+371
-180
lines changed

lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntReader.java

Lines changed: 0 additions & 57 deletions
This file was deleted.

lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntWriter.java

Lines changed: 0 additions & 63 deletions
This file was deleted.

lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ static void readVIntBlock(
149149
boolean indexHasFreq,
150150
boolean decodeFreq)
151151
throws IOException {
152-
GroupVIntReader.readValues(docIn, docBuffer, num);
152+
docIn.readGroupVInts(docBuffer, num);
153153
if (indexHasFreq && decodeFreq) {
154154
for (int i = 0; i < num; ++i) {
155155
freqBuffer[i] = docBuffer[i] & 0x01;

lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsWriter.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
9292
private final PForUtil pforUtil;
9393
private final ForDeltaUtil forDeltaUtil;
9494
private final Lucene99SkipWriter skipWriter;
95-
private final GroupVIntWriter docGroupVIntWriter;
9695

9796
private boolean fieldHasNorms;
9897
private NumericDocValues norms;
@@ -173,7 +172,6 @@ public Lucene99PostingsWriter(SegmentWriteState state) throws IOException {
173172
skipWriter =
174173
new Lucene99SkipWriter(
175174
MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut);
176-
docGroupVIntWriter = new GroupVIntWriter();
177175
}
178176

179177
@Override
@@ -378,7 +376,7 @@ public void finishTerm(BlockTermState _state) throws IOException {
378376
docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
379377
}
380378
}
381-
docGroupVIntWriter.writeValues(docOut, docDeltaBuffer, docBufferUpto);
379+
docOut.writeGroupVInts(docDeltaBuffer, docBufferUpto);
382380
if (writeFreqs) {
383381
for (int i = 0; i < docBufferUpto; i++) {
384382
final int freq = (int) freqBuffer[i];

lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import java.io.IOException;
2121
import java.nio.ByteBuffer;
2222
import java.nio.ByteOrder;
23+
import org.apache.lucene.util.GroupVIntUtil;
2324

2425
/** Base implementation class for buffered {@link IndexInput}. */
2526
public abstract class BufferedIndexInput extends IndexInput implements RandomAccessInput {
@@ -149,6 +150,16 @@ public final int readInt() throws IOException {
149150
}
150151
}
151152

153+
@Override
154+
protected void readGroupVInt(long[] dst, int offset) throws IOException {
155+
final int len =
156+
GroupVIntUtil.readGroupVInt(
157+
this, buffer.remaining(), p -> buffer.getInt((int) p), buffer.position(), dst, offset);
158+
if (len > 0) {
159+
buffer.position(buffer.position() + len);
160+
}
161+
}
162+
152163
@Override
153164
public final long readLong() throws IOException {
154165
if (Long.BYTES <= buffer.remaining()) {

lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import java.util.Locale;
3030
import java.util.stream.Collectors;
3131
import org.apache.lucene.util.Accountable;
32+
import org.apache.lucene.util.GroupVIntUtil;
3233
import org.apache.lucene.util.RamUsageEstimator;
3334

3435
/**
@@ -212,6 +213,25 @@ public long readLong() throws IOException {
212213
}
213214
}
214215

216+
@Override
217+
protected void readGroupVInt(long[] dst, int offset) throws IOException {
218+
final ByteBuffer block = blocks[blockIndex(pos)];
219+
final int blockOffset = blockOffset(pos);
220+
// We MUST save the return value in a local variable and cannot use pos += readGroupVInt(...),
221+
// because `pos +=` in Java reads the current value (not the address) of pos first and only
222+
// then calls the function. The function itself updates pos via readByte(), so `pos +=`
223+
// would add the return value to the stale pos value, thereby missing 1 byte.
224+
final int len =
225+
GroupVIntUtil.readGroupVInt(
226+
this,
227+
block.limit() - blockOffset,
228+
p -> block.getInt((int) p),
229+
blockOffset,
230+
dst,
231+
offset);
232+
pos += len;
233+
}
234+
215235
@Override
216236
public long length() {
217237
return length;

lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexInput.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,12 @@ public void readLongs(long[] dst, int offset, int length) throws IOException {
199199
in.readLongs(dst, offset, length);
200200
}
201201

202+
@Override
203+
protected void readGroupVInt(long[] dst, int offset) throws IOException {
204+
ensureOpen();
205+
in.readGroupVInt(dst, offset);
206+
}
207+
202208
@Override
203209
public IndexInput clone() {
204210
ensureOpen();

lucene/core/src/java/org/apache/lucene/store/DataInput.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import java.util.TreeMap;
2828
import java.util.TreeSet;
2929
import org.apache.lucene.util.BitUtil;
30+
import org.apache.lucene.util.GroupVIntUtil;
3031

3132
/**
3233
* Abstract base class for performing read operations of Lucene's low-level data types.
@@ -98,6 +99,32 @@ public int readInt() throws IOException {
9899
return ((b4 & 0xFF) << 24) | ((b3 & 0xFF) << 16) | ((b2 & 0xFF) << 8) | (b1 & 0xFF);
99100
}
100101

102+
/**
103+
* Read all the group varints, including the tail vints. We need a long[] because this is what
104+
* postings are using, all longs are actually required to be integers.
105+
*
106+
* @param dst the array to read ints into.
107+
* @param limit the number of int values to read.
108+
* @lucene.experimental
109+
*/
110+
public final void readGroupVInts(long[] dst, int limit) throws IOException {
111+
int i;
112+
for (i = 0; i <= limit - 4; i += 4) {
113+
readGroupVInt(dst, i);
114+
}
115+
for (; i < limit; ++i) {
116+
dst[i] = readVInt();
117+
}
118+
}
119+
120+
/**
121+
* Override if you have an efficient implementation. In general this is when the input supports
122+
* random access.
123+
*/
124+
protected void readGroupVInt(long[] dst, int offset) throws IOException {
125+
GroupVIntUtil.readGroupVInt(this, dst, offset);
126+
}
127+
101128
/**
102129
* Reads an int stored in variable-length format. Reads between one and five bytes. Smaller values
103130
* take fewer bytes. Negative numbers are supported, but should be avoided.

lucene/core/src/java/org/apache/lucene/store/DataOutput.java

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.util.Set;
2222
import org.apache.lucene.util.BitUtil;
2323
import org.apache.lucene.util.BytesRef;
24+
import org.apache.lucene.util.BytesRefBuilder;
2425

2526
/**
2627
* Abstract base class for performing write operations of Lucene's low-level data types.
@@ -29,6 +30,7 @@
2930
* internal state like file position).
3031
*/
3132
public abstract class DataOutput {
33+
private final BytesRefBuilder groupVIntBytes = new BytesRefBuilder();
3234

3335
/**
3436
* Writes a single byte.
@@ -322,4 +324,43 @@ public void writeSetOfStrings(Set<String> set) throws IOException {
322324
writeString(value);
323325
}
324326
}
327+
328+
/**
329+
* Encode integers using group-varint. It uses {@link DataOutput#writeVInt VInt} to encode tail
330+
* values that are not enough for a group. We need a long[] because this is what postings are
331+
* using, all longs are actually required to be integers.
332+
*
333+
* @param values the values to write
334+
* @param limit the number of values to write.
335+
* @lucene.experimental
336+
*/
337+
public void writeGroupVInts(long[] values, int limit) throws IOException {
338+
int off = 0;
339+
340+
// encode each group
341+
while ((limit - off) >= 4) {
342+
byte flag = 0;
343+
groupVIntBytes.setLength(1);
344+
flag |= (encodeGroupValue(Math.toIntExact(values[off++])) - 1) << 6;
345+
flag |= (encodeGroupValue(Math.toIntExact(values[off++])) - 1) << 4;
346+
flag |= (encodeGroupValue(Math.toIntExact(values[off++])) - 1) << 2;
347+
flag |= (encodeGroupValue(Math.toIntExact(values[off++])) - 1);
348+
groupVIntBytes.setByteAt(0, flag);
349+
writeBytes(groupVIntBytes.bytes(), groupVIntBytes.length());
350+
}
351+
352+
// tail vints
353+
for (; off < limit; off++) {
354+
writeVInt(Math.toIntExact(values[off]));
355+
}
356+
}
357+
358+
private int encodeGroupValue(int v) {
359+
int lastOff = groupVIntBytes.length();
360+
do {
361+
groupVIntBytes.append((byte) (v & 0xFF));
362+
v >>>= 8;
363+
} while (v != 0);
364+
return groupVIntBytes.length() - lastOff;
365+
}
325366
}

0 commit comments

Comments
 (0)