3535import org .apache .lucene .index .Terms ;
3636import org .apache .lucene .index .TermsEnum ;
3737import org .apache .lucene .search .DocIdSetIterator ;
38- import org .apache .lucene .store .ChecksumIndexInput ;
38+ import org .apache .lucene .store .ByteBuffersDataOutput ;
3939import org .apache .lucene .store .DataInput ;
4040import org .apache .lucene .store .DataOutput ;
4141import org .apache .lucene .store .Directory ;
4646import org .apache .lucene .store .TrackingDirectoryWrapper ;
4747import org .apache .lucene .util .ArrayUtil ;
4848import org .apache .lucene .util .BytesRef ;
49- import org .apache .lucene .util .BytesRefComparator ;
5049import org .apache .lucene .util .CloseableThreadLocal ;
5150import org .apache .lucene .util .IOUtils ;
5251import org .apache .lucene .util .IntroSorter ;
5352import org .apache .lucene .util .IntsRef ;
54- import org .apache .lucene .util .OfflineSorter ;
55- import org .apache .lucene .util .OfflineSorter .BufferSize ;
53+ import org .apache .lucene .util .packed .PackedInts ;
5654
5755/**
5856 * Implementation of "recursive graph bisection", also called "bipartite graph partitioning" and
@@ -654,9 +652,7 @@ private int writePostings(
654652 for (int doc = postings .nextDoc ();
655653 doc != DocIdSetIterator .NO_MORE_DOCS ;
656654 doc = postings .nextDoc ()) {
657- // reverse bytes so that byte order matches natural order
658- postingsOut .writeInt (Integer .reverseBytes (doc ));
659- postingsOut .writeInt (Integer .reverseBytes (termID ));
655+ postingsOut .writeLong (Integer .toUnsignedLong (termID ) << 32 | Integer .toUnsignedLong (doc ));
660656 }
661657 }
662658 }
@@ -665,107 +661,60 @@ private int writePostings(
665661
666662 private ForwardIndex buildForwardIndex (
667663 Directory tempDir , String postingsFileName , int maxDoc , int maxTerm ) throws IOException {
668- String sortedPostingsFile =
669- new OfflineSorter (
670- tempDir ,
671- "forward-index" ,
672- // Implement BytesRefComparator to make OfflineSorter use radix sort
673- new BytesRefComparator (2 * Integer .BYTES ) {
674- @ Override
675- protected int byteAt (BytesRef ref , int i ) {
676- return ref .bytes [ref .offset + i ] & 0xFF ;
677- }
678-
679- @ Override
680- public int compare (BytesRef o1 , BytesRef o2 , int k ) {
681- assert o1 .length == 2 * Integer .BYTES ;
682- assert o2 .length == 2 * Integer .BYTES ;
683- return ArrayUtil .compareUnsigned8 (o1 .bytes , o1 .offset , o2 .bytes , o2 .offset );
684- }
685- },
686- BufferSize .megabytes ((long ) (ramBudgetMB / getParallelism ())),
687- OfflineSorter .MAX_TEMPFILES ,
688- 2 * Integer .BYTES ,
689- forkJoinPool ,
690- getParallelism ()) {
691-
692- @ Override
693- protected ByteSequencesReader getReader (ChecksumIndexInput in , String name )
694- throws IOException {
695- return new ByteSequencesReader (in , postingsFileName ) {
696- {
697- ref .grow (2 * Integer .BYTES );
698- ref .setLength (2 * Integer .BYTES );
699- }
700-
701- @ Override
702- public BytesRef next () throws IOException {
703- if (in .getFilePointer () >= end ) {
704- return null ;
705- }
706- // optimized read of 8 bytes
707- in .readBytes (ref .bytes (), 0 , 2 * Integer .BYTES );
708- return ref .get ();
709- }
710- };
711- }
712-
713- @ Override
714- protected ByteSequencesWriter getWriter (IndexOutput out , long itemCount )
715- throws IOException {
716- return new ByteSequencesWriter (out ) {
717- @ Override
718- public void write (byte [] bytes , int off , int len ) throws IOException {
719- assert len == 2 * Integer .BYTES ;
720- // optimized read of 8 bytes
721- out .writeBytes (bytes , off , len );
722- }
723- };
724- }
725- }.sort (postingsFileName );
726664
727665 String termIDsFileName ;
728666 String startOffsetsFileName ;
729- int prevDoc = -1 ;
730- try (IndexInput sortedPostings = tempDir .openInput (sortedPostingsFile , IOContext .READONCE );
731- IndexOutput termIDs = tempDir .createTempOutput ("term-ids" , "" , IOContext .DEFAULT );
667+ try (IndexOutput termIDs = tempDir .createTempOutput ("term-ids" , "" , IOContext .DEFAULT );
732668 IndexOutput startOffsets =
733669 tempDir .createTempOutput ("start-offsets" , "" , IOContext .DEFAULT )) {
734670 termIDsFileName = termIDs .getName ();
735671 startOffsetsFileName = startOffsets .getName ();
736- final long end = sortedPostings .length () - CodecUtil .footerLength ();
737672 int [] buffer = new int [TERM_IDS_BLOCK_SIZE ];
738- int bufferLen = 0 ;
739- while (sortedPostings .getFilePointer () < end ) {
740- final int doc = Integer .reverseBytes (sortedPostings .readInt ());
741- final int termID = Integer .reverseBytes (sortedPostings .readInt ());
742- if (doc != prevDoc ) {
743- if (bufferLen != 0 ) {
744- writeMonotonicInts (buffer , bufferLen , termIDs );
745- bufferLen = 0 ;
746- }
673+ new ForwardIndexSorter (tempDir )
674+ .sortAndConsume (
675+ postingsFileName ,
676+ maxDoc ,
677+ new LongConsumer () {
678+
679+ int prevDoc = -1 ;
680+ int bufferLen = 0 ;
681+
682+ @ Override
683+ public void accept (long value ) throws IOException {
684+ int doc = (int ) value ;
685+ int termID = (int ) (value >>> 32 );
686+ if (doc != prevDoc ) {
687+ if (bufferLen != 0 ) {
688+ writeMonotonicInts (buffer , bufferLen , termIDs );
689+ bufferLen = 0 ;
690+ }
691+
692+ assert doc > prevDoc ;
693+ for (int d = prevDoc + 1 ; d <= doc ; ++d ) {
694+ startOffsets .writeLong (termIDs .getFilePointer ());
695+ }
696+ prevDoc = doc ;
697+ }
698+ assert termID < maxTerm : termID + " " + maxTerm ;
699+ if (bufferLen == buffer .length ) {
700+ writeMonotonicInts (buffer , bufferLen , termIDs );
701+ bufferLen = 0 ;
702+ }
703+ buffer [bufferLen ++] = termID ;
704+ }
747705
748- assert doc > prevDoc ;
749- for (int d = prevDoc + 1 ; d <= doc ; ++d ) {
750- startOffsets .writeLong (termIDs .getFilePointer ());
751- }
752- prevDoc = doc ;
753- }
754- assert termID < maxTerm : termID + " " + maxTerm ;
755- if (bufferLen == buffer .length ) {
756- writeMonotonicInts (buffer , bufferLen , termIDs );
757- bufferLen = 0 ;
758- }
759- buffer [bufferLen ++] = termID ;
760- }
761- if (bufferLen != 0 ) {
762- writeMonotonicInts (buffer , bufferLen , termIDs );
763- }
764- for (int d = prevDoc + 1 ; d <= maxDoc ; ++d ) {
765- startOffsets .writeLong (termIDs .getFilePointer ());
766- }
767- CodecUtil .writeFooter (termIDs );
768- CodecUtil .writeFooter (startOffsets );
706+ @ Override
707+ public void onFinish () throws IOException {
708+ if (bufferLen != 0 ) {
709+ writeMonotonicInts (buffer , bufferLen , termIDs );
710+ }
711+ for (int d = prevDoc + 1 ; d <= maxDoc ; ++d ) {
712+ startOffsets .writeLong (termIDs .getFilePointer ());
713+ }
714+ CodecUtil .writeFooter (termIDs );
715+ CodecUtil .writeFooter (startOffsets );
716+ }
717+ });
769718 }
770719
771720 IndexInput termIDsInput = tempDir .openInput (termIDsFileName , IOContext .READ );
@@ -991,4 +940,169 @@ static int readMonotonicInts(DataInput in, int[] ints) throws IOException {
991940 }
992941 return len ;
993942 }
943+
944+ /**
945+ * Use a LSB Radix Sorter to sort the (docID, termID) entries. We only need to compare docIds
946+ * because LSB Radix Sorter is stable and termIDs already sorted.
947+ *
948+ * <p>This sorter will require at least 16MB ({@link #BUFFER_BYTES} * {@link #HISTOGRAM_SIZE})
949+ * RAM.
950+ */
951+ static class ForwardIndexSorter {
952+
953+ private static final int HISTOGRAM_SIZE = 256 ;
954+ private static final int BUFFER_SIZE = 8192 ;
955+ private static final int BUFFER_BYTES = BUFFER_SIZE * Long .BYTES ;
956+ private final Directory directory ;
957+ private final Bucket [] buckets = new Bucket [HISTOGRAM_SIZE ];
958+
959+ private static class Bucket {
960+ private final ByteBuffersDataOutput fps = new ByteBuffersDataOutput ();
961+ private final long [] buffer = new long [BUFFER_SIZE ];
962+ private IndexOutput output ;
963+ private int bufferUsed ;
964+ private int blockNum ;
965+ private long lastFp ;
966+ private int finalBlockSize ;
967+
968+ private void addEntry (long l ) throws IOException {
969+ buffer [bufferUsed ++] = l ;
970+ if (bufferUsed == BUFFER_SIZE ) {
971+ flush (false );
972+ }
973+ }
974+
975+ private void flush (boolean isFinal ) throws IOException {
976+ if (isFinal ) {
977+ finalBlockSize = bufferUsed ;
978+ }
979+ long fp = output .getFilePointer ();
980+ fps .writeVLong (encode (fp - lastFp ));
981+ lastFp = fp ;
982+ for (int i = 0 ; i < bufferUsed ; i ++) {
983+ output .writeLong (buffer [i ]);
984+ }
985+ lastFp = fp ;
986+ blockNum ++;
987+ bufferUsed = 0 ;
988+ }
989+
990+ private void reset (IndexOutput resetOutput ) {
991+ output = resetOutput ;
992+ finalBlockSize = 0 ;
993+ bufferUsed = 0 ;
994+ blockNum = 0 ;
995+ lastFp = 0 ;
996+ fps .reset ();
997+ }
998+ }
999+
1000+ private static long encode (long fpDelta ) {
1001+ assert (fpDelta & 0x07 ) == 0 : "fpDelta should be multiple of 8" ;
1002+ if (fpDelta % BUFFER_BYTES == 0 ) {
1003+ return ((fpDelta / BUFFER_BYTES ) << 1 ) | 1 ;
1004+ } else {
1005+ return fpDelta ;
1006+ }
1007+ }
1008+
1009+ private static long decode (long fpDelta ) {
1010+ if ((fpDelta & 1 ) == 1 ) {
1011+ return (fpDelta >>> 1 ) * BUFFER_BYTES ;
1012+ } else {
1013+ return fpDelta ;
1014+ }
1015+ }
1016+
1017+ ForwardIndexSorter (Directory directory ) {
1018+ this .directory = directory ;
1019+ for (int i = 0 ; i < HISTOGRAM_SIZE ; i ++) {
1020+ buckets [i ] = new Bucket ();
1021+ }
1022+ }
1023+
1024+ private void consume (String fileName , LongConsumer consumer ) throws IOException {
1025+ try (IndexInput in = directory .openInput (fileName , IOContext .READONCE )) {
1026+ final long end = in .length () - CodecUtil .footerLength ();
1027+ while (in .getFilePointer () < end ) {
1028+ consumer .accept (in .readLong ());
1029+ }
1030+ }
1031+ consumer .onFinish ();
1032+ }
1033+
1034+ private void consume (String fileName , long indexFP , LongConsumer consumer ) throws IOException {
1035+ try (IndexInput index = directory .openInput (fileName , IOContext .READONCE );
1036+ IndexInput value = directory .openInput (fileName , IOContext .READONCE )) {
1037+ index .seek (indexFP );
1038+ for (int i = 0 ; i < buckets .length ; i ++) {
1039+ int blockNum = index .readVInt ();
1040+ int finalBlockSize = index .readVInt ();
1041+ long fp = decode (index .readVLong ());
1042+ for (int block = 0 ; block < blockNum - 1 ; block ++) {
1043+ value .seek (fp );
1044+ for (int j = 0 ; j < BUFFER_SIZE ; j ++) {
1045+ consumer .accept (value .readLong ());
1046+ }
1047+ fp += decode (index .readVLong ());
1048+ }
1049+ value .seek (fp );
1050+ for (int j = 0 ; j < finalBlockSize ; j ++) {
1051+ consumer .accept (value .readLong ());
1052+ }
1053+ }
1054+ consumer .onFinish ();
1055+ }
1056+ }
1057+
1058+ private LongConsumer consumer (int shift ) {
1059+ return new LongConsumer () {
1060+ @ Override
1061+ public void accept (long value ) throws IOException {
1062+ int b = (int ) ((value >>> shift ) & 0xFF );
1063+ Bucket bucket = buckets [b ];
1064+ bucket .addEntry (value );
1065+ }
1066+
1067+ @ Override
1068+ public void onFinish () throws IOException {
1069+ for (Bucket bucket : buckets ) {
1070+ bucket .flush (true );
1071+ }
1072+ }
1073+ };
1074+ }
1075+
1076+ void sortAndConsume (String fileName , int maxDoc , LongConsumer consumer ) throws IOException {
1077+ int bitsRequired = PackedInts .bitsRequired (maxDoc );
1078+ String sourceFileName = fileName ;
1079+ long indexFP = -1 ;
1080+ for (int shift = 0 ; shift < bitsRequired ; shift += 8 ) {
1081+ try (IndexOutput output = directory .createTempOutput (fileName , "sort" , IOContext .DEFAULT )) {
1082+ Arrays .stream (buckets ).forEach (b -> b .reset (output ));
1083+ if (shift == 0 ) {
1084+ consume (sourceFileName , consumer (shift ));
1085+ } else {
1086+ consume (sourceFileName , indexFP , consumer (shift ));
1087+ directory .deleteFile (sourceFileName );
1088+ }
1089+ indexFP = output .getFilePointer ();
1090+ for (Bucket bucket : buckets ) {
1091+ output .writeVInt (bucket .blockNum );
1092+ output .writeVInt (bucket .finalBlockSize );
1093+ bucket .fps .copyTo (output );
1094+ }
1095+ CodecUtil .writeFooter (output );
1096+ sourceFileName = output .getName ();
1097+ }
1098+ }
1099+ consume (sourceFileName , indexFP , consumer );
1100+ }
1101+ }
1102+
  /**
   * Callback receiving packed postings entries: the termID in the high 32 bits and the doc ID in
   * the low 32 bits of each {@code long}.
   */
  interface LongConsumer {
    /** Consumes one packed (termID, docID) entry. */
    void accept(long value) throws IOException;

    /** Called exactly once after the last entry has been accepted; default is a no-op. */
    default void onFinish() throws IOException {}
  }
9941108}
0 commit comments