
Commit 50607e0
LUCENE-9935: Enable bulk-merge for term vectors with index sort (#140)
This change enables bulk-merge for term vectors with index sort. The algorithm used here is similar to the one used to merge stored fields. Relates #134
1 parent: 3bedc08
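The bulk-merge idea, by analogy: when merging already-sorted inputs, any maximal run of consecutive output elements that all come from one input can be copied in a single block operation instead of element by element. This commit applies that idea at the granularity of compressed term-vector chunks, as the stored-fields merger already does; a chunk can only be copied wholesale when it is complete (not dirty) and its documents stay contiguous under the index sort. Below is a self-contained toy sketch of the pattern using plain Java arrays, not Lucene's API; all names here are illustrative.

import java.util.Arrays;

/**
 * Toy illustration of bulk merging (not Lucene code): merge two sorted
 * arrays, but copy maximal single-source runs with System.arraycopy
 * instead of moving one element at a time.
 */
public class BulkMergeSketch {
  public static void main(String[] args) {
    int[] a = {1, 2, 3, 10, 11};
    int[] b = {4, 5, 6, 7, 20};
    int[] out = new int[a.length + b.length];
    int i = 0, j = 0, k = 0;
    while (i < a.length && j < b.length) {
      if (a[i] <= b[j]) {
        // find the longest run of 'a' that stays <= b[j], then bulk-copy it
        int start = i;
        while (i < a.length && a[i] <= b[j]) i++;
        System.arraycopy(a, start, out, k, i - start); // the "bulk" step
        k += i - start;
      } else {
        int start = j;
        while (j < b.length && b[j] < a[i]) j++;
        System.arraycopy(b, start, out, k, j - start);
        k += j - start;
      }
    }
    // copy whatever remains in either input
    System.arraycopy(a, i, out, k, a.length - i);
    k += a.length - i;
    System.arraycopy(b, j, out, k, b.length - j);
    System.out.println(Arrays.toString(out)); // [1, 2, 3, 4, 5, 6, 7, 10, 11, 20]
  }
}

In Lucene the "bulk" step is copying a whole compressed chunk from the source segment's vectors file without decompressing it, falling back to document-by-document copying when that is not possible.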


3 files changed: +276 -145 lines


lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java

Lines changed: 26 additions & 4 deletions
@@ -91,6 +91,7 @@ public final class Lucene90CompressingTermVectorsReader extends TermVectorsReader
   private final long numDirtyChunks; // number of incomplete compressed blocks written
   private final long numDirtyDocs; // cumulative number of docs in incomplete chunks
   private final long maxPointer; // end of the data section
+  private BlockState blockState = new BlockState(-1, -1, 0);
 
   // used by clone
   private Lucene90CompressingTermVectorsReader(Lucene90CompressingTermVectorsReader reader) {
@@ -310,25 +311,46 @@ private static RandomAccessInput slice(IndexInput in) throws IOException {
     return new ByteBuffersDataInput(Collections.singletonList(ByteBuffer.wrap(bytes)));
   }
 
+  /** Checks if a given docID was loaded in the current block state. */
+  boolean isLoaded(int docID) {
+    return blockState.docBase <= docID && docID < blockState.docBase + blockState.chunkDocs;
+  }
+
+  private static class BlockState {
+    final long startPointer;
+    final int docBase;
+    final int chunkDocs;
+
+    BlockState(long startPointer, int docBase, int chunkDocs) {
+      this.startPointer = startPointer;
+      this.docBase = docBase;
+      this.chunkDocs = chunkDocs;
+    }
+  }
+
   @Override
   public Fields get(int doc) throws IOException {
     ensureOpen();
 
     // seek to the right place
-    {
-      final long startPointer = indexReader.getStartPointer(doc);
-      vectorsStream.seek(startPointer);
+    final long startPointer;
+    if (isLoaded(doc)) {
+      startPointer = blockState.startPointer; // avoid searching the start pointer
+    } else {
+      startPointer = indexReader.getStartPointer(doc);
     }
+    vectorsStream.seek(startPointer);
 
     // decode
     // - docBase: first doc ID of the chunk
     // - chunkDocs: number of docs of the chunk
     final int docBase = vectorsStream.readVInt();
-    final int chunkDocs = vectorsStream.readVInt();
+    final int chunkDocs = vectorsStream.readVInt() >>> 1;
     if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
       throw new CorruptIndexException(
           "docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc, vectorsStream);
     }
+    this.blockState = new BlockState(startPointer, docBase, chunkDocs);
 
     final int skip; // number of fields to skip
     final int numFields; // number of fields of the document we're looking for
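Two details of the reader change are worth calling out. First, chunkDocs is now read as readVInt() >>> 1: the writer side of this commit evidently packs a flag into the low bit of the chunk's doc count (as the stored-fields format does), presumably marking dirty/incomplete chunks so the bulk merger can avoid copying them; the reader discards the bit with the shift. Second, the new BlockState caches the start pointer and doc range of the most recently decoded chunk, so the mostly-sequential access pattern of a merge consults indexReader.getStartPointer only once per chunk rather than once per document. Below is a minimal standalone sketch of that caching pattern; the class, the 128-doc chunk size, and the pointer arithmetic are all hypothetical stand-ins, not Lucene's code.

/**
 * Minimal sketch of the block-state caching pattern above: remember the
 * doc range and start pointer of the last decoded chunk so that repeated
 * lookups in that range skip the index search.
 */
public class BlockStateCacheSketch {

  private static class BlockState {
    final long startPointer;
    final int docBase;
    final int chunkDocs;

    BlockState(long startPointer, int docBase, int chunkDocs) {
      this.startPointer = startPointer;
      this.docBase = docBase;
      this.chunkDocs = chunkDocs;
    }
  }

  private BlockState blockState = new BlockState(-1, -1, 0);
  private int indexLookups = 0; // how often we had to consult the index

  private boolean isLoaded(int docID) {
    return blockState.docBase <= docID && docID < blockState.docBase + blockState.chunkDocs;
  }

  long startPointer(int docID) {
    if (isLoaded(docID)) {
      return blockState.startPointer; // cache hit: no index search needed
    }
    indexLookups++;
    // stand-in for indexReader.getStartPointer(doc): assume 128-doc chunks
    // laid out back to back, 8 KB each, purely for illustration
    int docBase = docID - (docID % 128);
    blockState = new BlockState((docBase / 128) * 8192L, docBase, 128);
    return blockState.startPointer;
  }

  public static void main(String[] args) {
    BlockStateCacheSketch reader = new BlockStateCacheSketch();
    for (int doc = 0; doc < 1024; doc++) {
      reader.startPointer(doc); // sequential access, as during a merge
    }
    System.out.println("index lookups: " + reader.indexLookups); // prints: index lookups: 8
  }
}

Run sequentially over 1024 docs, the sketch performs 8 index lookups instead of 1024; Lucene's reader gets the same effect for get(doc) calls that land in the already-loaded chunk.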
