Skip to content

Commit 1cb1a14

Browse files
committed
Speed up the sort when building forward index (#12712)
1 parent 8b38b73 commit 1cb1a14

File tree

3 files changed

+291
-99
lines changed

3 files changed

+291
-99
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,8 @@ Optimizations
229229

230230
* GITHUB#12710: Use Arrays#mismatch for Outputs#common operations. (Guo Feng)
231231

232+
* GITHUB#12712: Speed up sorting postings file with an offline radix sorter in BPIndexReorderer. (Guo Feng)
233+
232234
Changes in runtime behavior
233235
---------------------
234236

lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java

Lines changed: 213 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
import org.apache.lucene.index.Terms;
3636
import org.apache.lucene.index.TermsEnum;
3737
import org.apache.lucene.search.DocIdSetIterator;
38-
import org.apache.lucene.store.ChecksumIndexInput;
38+
import org.apache.lucene.store.ByteBuffersDataOutput;
3939
import org.apache.lucene.store.DataInput;
4040
import org.apache.lucene.store.DataOutput;
4141
import org.apache.lucene.store.Directory;
@@ -46,13 +46,11 @@
4646
import org.apache.lucene.store.TrackingDirectoryWrapper;
4747
import org.apache.lucene.util.ArrayUtil;
4848
import org.apache.lucene.util.BytesRef;
49-
import org.apache.lucene.util.BytesRefComparator;
5049
import org.apache.lucene.util.CloseableThreadLocal;
5150
import org.apache.lucene.util.IOUtils;
5251
import org.apache.lucene.util.IntroSorter;
5352
import org.apache.lucene.util.IntsRef;
54-
import org.apache.lucene.util.OfflineSorter;
55-
import org.apache.lucene.util.OfflineSorter.BufferSize;
53+
import org.apache.lucene.util.packed.PackedInts;
5654

5755
/**
5856
* Implementation of "recursive graph bisection", also called "bipartite graph partitioning" and
@@ -654,9 +652,7 @@ private int writePostings(
654652
for (int doc = postings.nextDoc();
655653
doc != DocIdSetIterator.NO_MORE_DOCS;
656654
doc = postings.nextDoc()) {
657-
// reverse bytes so that byte order matches natural order
658-
postingsOut.writeInt(Integer.reverseBytes(doc));
659-
postingsOut.writeInt(Integer.reverseBytes(termID));
655+
postingsOut.writeLong(Integer.toUnsignedLong(termID) << 32 | Integer.toUnsignedLong(doc));
660656
}
661657
}
662658
}
@@ -665,107 +661,60 @@ private int writePostings(
665661

666662
private ForwardIndex buildForwardIndex(
667663
Directory tempDir, String postingsFileName, int maxDoc, int maxTerm) throws IOException {
668-
String sortedPostingsFile =
669-
new OfflineSorter(
670-
tempDir,
671-
"forward-index",
672-
// Implement BytesRefComparator to make OfflineSorter use radix sort
673-
new BytesRefComparator(2 * Integer.BYTES) {
674-
@Override
675-
protected int byteAt(BytesRef ref, int i) {
676-
return ref.bytes[ref.offset + i] & 0xFF;
677-
}
678-
679-
@Override
680-
public int compare(BytesRef o1, BytesRef o2, int k) {
681-
assert o1.length == 2 * Integer.BYTES;
682-
assert o2.length == 2 * Integer.BYTES;
683-
return ArrayUtil.compareUnsigned8(o1.bytes, o1.offset, o2.bytes, o2.offset);
684-
}
685-
},
686-
BufferSize.megabytes((long) (ramBudgetMB / getParallelism())),
687-
OfflineSorter.MAX_TEMPFILES,
688-
2 * Integer.BYTES,
689-
forkJoinPool,
690-
getParallelism()) {
691-
692-
@Override
693-
protected ByteSequencesReader getReader(ChecksumIndexInput in, String name)
694-
throws IOException {
695-
return new ByteSequencesReader(in, postingsFileName) {
696-
{
697-
ref.grow(2 * Integer.BYTES);
698-
ref.setLength(2 * Integer.BYTES);
699-
}
700-
701-
@Override
702-
public BytesRef next() throws IOException {
703-
if (in.getFilePointer() >= end) {
704-
return null;
705-
}
706-
// optimized read of 8 bytes
707-
in.readBytes(ref.bytes(), 0, 2 * Integer.BYTES);
708-
return ref.get();
709-
}
710-
};
711-
}
712-
713-
@Override
714-
protected ByteSequencesWriter getWriter(IndexOutput out, long itemCount)
715-
throws IOException {
716-
return new ByteSequencesWriter(out) {
717-
@Override
718-
public void write(byte[] bytes, int off, int len) throws IOException {
719-
assert len == 2 * Integer.BYTES;
720-
// optimized read of 8 bytes
721-
out.writeBytes(bytes, off, len);
722-
}
723-
};
724-
}
725-
}.sort(postingsFileName);
726664

727665
String termIDsFileName;
728666
String startOffsetsFileName;
729-
int prevDoc = -1;
730-
try (IndexInput sortedPostings = tempDir.openInput(sortedPostingsFile, IOContext.READONCE);
731-
IndexOutput termIDs = tempDir.createTempOutput("term-ids", "", IOContext.DEFAULT);
667+
try (IndexOutput termIDs = tempDir.createTempOutput("term-ids", "", IOContext.DEFAULT);
732668
IndexOutput startOffsets =
733669
tempDir.createTempOutput("start-offsets", "", IOContext.DEFAULT)) {
734670
termIDsFileName = termIDs.getName();
735671
startOffsetsFileName = startOffsets.getName();
736-
final long end = sortedPostings.length() - CodecUtil.footerLength();
737672
int[] buffer = new int[TERM_IDS_BLOCK_SIZE];
738-
int bufferLen = 0;
739-
while (sortedPostings.getFilePointer() < end) {
740-
final int doc = Integer.reverseBytes(sortedPostings.readInt());
741-
final int termID = Integer.reverseBytes(sortedPostings.readInt());
742-
if (doc != prevDoc) {
743-
if (bufferLen != 0) {
744-
writeMonotonicInts(buffer, bufferLen, termIDs);
745-
bufferLen = 0;
746-
}
673+
new ForwardIndexSorter(tempDir)
674+
.sortAndConsume(
675+
postingsFileName,
676+
maxDoc,
677+
new LongConsumer() {
678+
679+
int prevDoc = -1;
680+
int bufferLen = 0;
681+
682+
@Override
683+
public void accept(long value) throws IOException {
684+
int doc = (int) value;
685+
int termID = (int) (value >>> 32);
686+
if (doc != prevDoc) {
687+
if (bufferLen != 0) {
688+
writeMonotonicInts(buffer, bufferLen, termIDs);
689+
bufferLen = 0;
690+
}
691+
692+
assert doc > prevDoc;
693+
for (int d = prevDoc + 1; d <= doc; ++d) {
694+
startOffsets.writeLong(termIDs.getFilePointer());
695+
}
696+
prevDoc = doc;
697+
}
698+
assert termID < maxTerm : termID + " " + maxTerm;
699+
if (bufferLen == buffer.length) {
700+
writeMonotonicInts(buffer, bufferLen, termIDs);
701+
bufferLen = 0;
702+
}
703+
buffer[bufferLen++] = termID;
704+
}
747705

748-
assert doc > prevDoc;
749-
for (int d = prevDoc + 1; d <= doc; ++d) {
750-
startOffsets.writeLong(termIDs.getFilePointer());
751-
}
752-
prevDoc = doc;
753-
}
754-
assert termID < maxTerm : termID + " " + maxTerm;
755-
if (bufferLen == buffer.length) {
756-
writeMonotonicInts(buffer, bufferLen, termIDs);
757-
bufferLen = 0;
758-
}
759-
buffer[bufferLen++] = termID;
760-
}
761-
if (bufferLen != 0) {
762-
writeMonotonicInts(buffer, bufferLen, termIDs);
763-
}
764-
for (int d = prevDoc + 1; d <= maxDoc; ++d) {
765-
startOffsets.writeLong(termIDs.getFilePointer());
766-
}
767-
CodecUtil.writeFooter(termIDs);
768-
CodecUtil.writeFooter(startOffsets);
706+
@Override
707+
public void onFinish() throws IOException {
708+
if (bufferLen != 0) {
709+
writeMonotonicInts(buffer, bufferLen, termIDs);
710+
}
711+
for (int d = prevDoc + 1; d <= maxDoc; ++d) {
712+
startOffsets.writeLong(termIDs.getFilePointer());
713+
}
714+
CodecUtil.writeFooter(termIDs);
715+
CodecUtil.writeFooter(startOffsets);
716+
}
717+
});
769718
}
770719

771720
IndexInput termIDsInput = tempDir.openInput(termIDsFileName, IOContext.READ);
@@ -991,4 +940,169 @@ static int readMonotonicInts(DataInput in, int[] ints) throws IOException {
991940
}
992941
return len;
993942
}
943+
944+
/**
945+
* Use a LSB Radix Sorter to sort the (docID, termID) entries. We only need to compare docIds
946+
* because LSB Radix Sorter is stable and termIDs already sorted.
947+
*
948+
* <p>This sorter will require at least 16MB ({@link #BUFFER_BYTES} * {@link #HISTOGRAM_SIZE})
949+
* RAM.
950+
*/
951+
static class ForwardIndexSorter {
952+
953+
private static final int HISTOGRAM_SIZE = 256;
954+
private static final int BUFFER_SIZE = 8192;
955+
private static final int BUFFER_BYTES = BUFFER_SIZE * Long.BYTES;
956+
private final Directory directory;
957+
private final Bucket[] buckets = new Bucket[HISTOGRAM_SIZE];
958+
959+
private static class Bucket {
960+
private final ByteBuffersDataOutput fps = new ByteBuffersDataOutput();
961+
private final long[] buffer = new long[BUFFER_SIZE];
962+
private IndexOutput output;
963+
private int bufferUsed;
964+
private int blockNum;
965+
private long lastFp;
966+
private int finalBlockSize;
967+
968+
private void addEntry(long l) throws IOException {
969+
buffer[bufferUsed++] = l;
970+
if (bufferUsed == BUFFER_SIZE) {
971+
flush(false);
972+
}
973+
}
974+
975+
private void flush(boolean isFinal) throws IOException {
976+
if (isFinal) {
977+
finalBlockSize = bufferUsed;
978+
}
979+
long fp = output.getFilePointer();
980+
fps.writeVLong(encode(fp - lastFp));
981+
lastFp = fp;
982+
for (int i = 0; i < bufferUsed; i++) {
983+
output.writeLong(buffer[i]);
984+
}
985+
lastFp = fp;
986+
blockNum++;
987+
bufferUsed = 0;
988+
}
989+
990+
private void reset(IndexOutput resetOutput) {
991+
output = resetOutput;
992+
finalBlockSize = 0;
993+
bufferUsed = 0;
994+
blockNum = 0;
995+
lastFp = 0;
996+
fps.reset();
997+
}
998+
}
999+
1000+
private static long encode(long fpDelta) {
1001+
assert (fpDelta & 0x07) == 0 : "fpDelta should be multiple of 8";
1002+
if (fpDelta % BUFFER_BYTES == 0) {
1003+
return ((fpDelta / BUFFER_BYTES) << 1) | 1;
1004+
} else {
1005+
return fpDelta;
1006+
}
1007+
}
1008+
1009+
private static long decode(long fpDelta) {
1010+
if ((fpDelta & 1) == 1) {
1011+
return (fpDelta >>> 1) * BUFFER_BYTES;
1012+
} else {
1013+
return fpDelta;
1014+
}
1015+
}
1016+
1017+
ForwardIndexSorter(Directory directory) {
1018+
this.directory = directory;
1019+
for (int i = 0; i < HISTOGRAM_SIZE; i++) {
1020+
buckets[i] = new Bucket();
1021+
}
1022+
}
1023+
1024+
private void consume(String fileName, LongConsumer consumer) throws IOException {
1025+
try (IndexInput in = directory.openInput(fileName, IOContext.READONCE)) {
1026+
final long end = in.length() - CodecUtil.footerLength();
1027+
while (in.getFilePointer() < end) {
1028+
consumer.accept(in.readLong());
1029+
}
1030+
}
1031+
consumer.onFinish();
1032+
}
1033+
1034+
private void consume(String fileName, long indexFP, LongConsumer consumer) throws IOException {
1035+
try (IndexInput index = directory.openInput(fileName, IOContext.READONCE);
1036+
IndexInput value = directory.openInput(fileName, IOContext.READONCE)) {
1037+
index.seek(indexFP);
1038+
for (int i = 0; i < buckets.length; i++) {
1039+
int blockNum = index.readVInt();
1040+
int finalBlockSize = index.readVInt();
1041+
long fp = decode(index.readVLong());
1042+
for (int block = 0; block < blockNum - 1; block++) {
1043+
value.seek(fp);
1044+
for (int j = 0; j < BUFFER_SIZE; j++) {
1045+
consumer.accept(value.readLong());
1046+
}
1047+
fp += decode(index.readVLong());
1048+
}
1049+
value.seek(fp);
1050+
for (int j = 0; j < finalBlockSize; j++) {
1051+
consumer.accept(value.readLong());
1052+
}
1053+
}
1054+
consumer.onFinish();
1055+
}
1056+
}
1057+
1058+
private LongConsumer consumer(int shift) {
1059+
return new LongConsumer() {
1060+
@Override
1061+
public void accept(long value) throws IOException {
1062+
int b = (int) ((value >>> shift) & 0xFF);
1063+
Bucket bucket = buckets[b];
1064+
bucket.addEntry(value);
1065+
}
1066+
1067+
@Override
1068+
public void onFinish() throws IOException {
1069+
for (Bucket bucket : buckets) {
1070+
bucket.flush(true);
1071+
}
1072+
}
1073+
};
1074+
}
1075+
1076+
void sortAndConsume(String fileName, int maxDoc, LongConsumer consumer) throws IOException {
1077+
int bitsRequired = PackedInts.bitsRequired(maxDoc);
1078+
String sourceFileName = fileName;
1079+
long indexFP = -1;
1080+
for (int shift = 0; shift < bitsRequired; shift += 8) {
1081+
try (IndexOutput output = directory.createTempOutput(fileName, "sort", IOContext.DEFAULT)) {
1082+
Arrays.stream(buckets).forEach(b -> b.reset(output));
1083+
if (shift == 0) {
1084+
consume(sourceFileName, consumer(shift));
1085+
} else {
1086+
consume(sourceFileName, indexFP, consumer(shift));
1087+
directory.deleteFile(sourceFileName);
1088+
}
1089+
indexFP = output.getFilePointer();
1090+
for (Bucket bucket : buckets) {
1091+
output.writeVInt(bucket.blockNum);
1092+
output.writeVInt(bucket.finalBlockSize);
1093+
bucket.fps.copyTo(output);
1094+
}
1095+
CodecUtil.writeFooter(output);
1096+
sourceFileName = output.getName();
1097+
}
1098+
}
1099+
consume(sourceFileName, indexFP, consumer);
1100+
}
1101+
}
1102+
1103+
  /**
   * Callback receiving long-encoded postings entries (docID in the low 32 bits, termID in the high
   * 32 bits) one at a time, with a final notification when the stream is exhausted.
   */
  interface LongConsumer {

    /** Accepts one encoded entry; may perform I/O. */
    void accept(long value) throws IOException;

    /** Called once after the last entry has been accepted; default is a no-op. */
    default void onFinish() throws IOException {}
  }
9941108
}

0 commit comments

Comments
 (0)