1717import org .apache .lucene .store .IOContext ;
1818import org .apache .lucene .store .IndexInput ;
1919import org .apache .lucene .store .IndexOutput ;
20+ import org .apache .lucene .util .IntroSorter ;
2021import org .apache .lucene .util .LongValues ;
2122import org .apache .lucene .util .VectorUtil ;
2223import org .apache .lucene .util .hnsw .IntToIntFunction ;
@@ -101,14 +102,16 @@ LongValues buildAndWritePostingsLists(
101102 postingsOutput .writeVInt (maxPostingListSize );
102103 // write the posting lists
103104 final PackedLongValues .Builder offsets = PackedLongValues .monotonicBuilder (PackedInts .COMPACT );
104- DocIdsWriter docIdsWriter = new DocIdsWriter ();
105105 DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter .OneBitDiskBBQBulkWriter (ES91OSQVectorsScorer .BULK_SIZE , postingsOutput );
106106 OnHeapQuantizedVectors onHeapQuantizedVectors = new OnHeapQuantizedVectors (
107107 floatVectorValues ,
108108 fieldInfo .getVectorDimension (),
109109 new OptimizedScalarQuantizer (fieldInfo .getVectorSimilarityFunction ())
110110 );
111111 final ByteBuffer buffer = ByteBuffer .allocate (fieldInfo .getVectorDimension () * Float .BYTES ).order (ByteOrder .LITTLE_ENDIAN );
112+ int [] docIds = null ;
113+ int [] docDeltas = null ;
114+ int [] clusterOrds = null ;
112115 for (int c = 0 ; c < centroidSupplier .size (); c ++) {
113116 float [] centroid = centroidSupplier .centroid (c );
114117 int [] cluster = assignmentsByCluster [c ];
@@ -121,11 +124,28 @@ LongValues buildAndWritePostingsLists(
121124 int size = cluster .length ;
122125 // write docIds
123126 postingsOutput .writeVInt (size );
124- onHeapQuantizedVectors .reset (centroid , size , ord -> cluster [ord ]);
127+ if (docIds == null || docIds .length < cluster .length ) {
128+ docIds = new int [cluster .length ];
129+ clusterOrds = new int [cluster .length ];
130+ docDeltas = new int [cluster .length ];
131+ }
132+ for (int j = 0 ; j < size ; j ++) {
133+ docIds [j ] = floatVectorValues .ordToDoc (cluster [j ]);
134+ clusterOrds [j ] = j ;
135+ }
136+ final int [] finalDocs = docIds ;
137+ final int [] finalOrds = clusterOrds ;
138+ // sort cluster.buffer by docIds values, this way cluster ordinals are sorted by docIds
139+ new IntSorter (clusterOrds , i -> finalDocs [i ]).sort (0 , size );
140+ // encode doc deltas
141+ for (int j = 0 ; j < size ; j ++) {
142+ docDeltas [j ] = j == 0 ? finalDocs [finalOrds [j ]] : finalDocs [finalOrds [j ]] - finalDocs [finalOrds [j - 1 ]];
143+ }
144+ onHeapQuantizedVectors .reset (centroid , size , ord -> cluster [finalOrds [ord ]]);
125145 // TODO we might want to consider putting the docIds in a separate file
126146 // to aid with only having to fetch vectors from slower storage when they are required
127147 // keeping them in the same file indicates we pull the entire file into cache
128- docIdsWriter . writeDocIds ( j -> floatVectorValues . ordToDoc ( cluster [ j ]) , size , postingsOutput );
148+ postingsOutput . writeGroupVInts ( docDeltas , size );
129149 // write vectors
130150 bulkWriter .writeVectors (onHeapQuantizedVectors );
131151 }
@@ -233,12 +253,14 @@ LongValues buildAndWritePostingsLists(
233253 quantizedVectorsInput ,
234254 fieldInfo .getVectorDimension ()
235255 );
236- DocIdsWriter docIdsWriter = new DocIdsWriter ();
237256 DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter .OneBitDiskBBQBulkWriter (ES91OSQVectorsScorer .BULK_SIZE , postingsOutput );
238257 final ByteBuffer buffer = ByteBuffer .allocate (fieldInfo .getVectorDimension () * Float .BYTES ).order (ByteOrder .LITTLE_ENDIAN );
239258 // write the max posting list size
240259 postingsOutput .writeVInt (maxPostingListSize );
241260 // write the posting lists
261+ int [] docIds = null ;
262+ int [] docDeltas = null ;
263+ int [] clusterOrds = null ;
242264 for (int c = 0 ; c < centroidSupplier .size (); c ++) {
243265 float [] centroid = centroidSupplier .centroid (c );
244266 int [] cluster = assignmentsByCluster [c ];
@@ -252,11 +274,28 @@ LongValues buildAndWritePostingsLists(
252274 // write docIds
253275 int size = cluster .length ;
254276 postingsOutput .writeVInt (size );
255- offHeapQuantizedVectors .reset (size , ord -> isOverspill [ord ], ord -> cluster [ord ]);
277+ if (docIds == null || docIds .length < cluster .length ) {
278+ docIds = new int [cluster .length ];
279+ clusterOrds = new int [cluster .length ];
280+ docDeltas = new int [cluster .length ];
281+ }
282+ for (int j = 0 ; j < size ; j ++) {
283+ docIds [j ] = floatVectorValues .ordToDoc (cluster [j ]);
284+ clusterOrds [j ] = j ;
285+ }
286+ final int [] finalDocs = docIds ;
287+ final int [] finalOrds = clusterOrds ;
288+ // sort cluster.buffer by docIds values, this way cluster ordinals are sorted by docIds
289+ new IntSorter (clusterOrds , i -> finalDocs [i ]).sort (0 , size );
290+ // encode doc deltas
291+ for (int j = 0 ; j < size ; j ++) {
292+ docDeltas [j ] = j == 0 ? finalDocs [finalOrds [j ]] : finalDocs [finalOrds [j ]] - finalDocs [finalOrds [j - 1 ]];
293+ }
294+ offHeapQuantizedVectors .reset (size , ord -> isOverspill [finalOrds [ord ]], ord -> cluster [finalOrds [ord ]]);
256295 // TODO we might want to consider putting the docIds in a separate file
257296 // to aid with only having to fetch vectors from slower storage when they are required
258297 // keeping them in the same file indicates we pull the entire file into cache
259- docIdsWriter . writeDocIds ( j -> floatVectorValues . ordToDoc ( cluster [ j ]) , size , postingsOutput );
298+ postingsOutput . writeGroupVInts ( docDeltas , size );
260299 // write vectors
261300 bulkWriter .writeVectors (offHeapQuantizedVectors );
262301 }
@@ -717,4 +756,37 @@ public void readQuantizedVector(int ord, boolean isOverspill) throws IOException
717756 bitSum = quantizedVectorsInput .readShort ();
718757 }
719758 }
759+
760+ private static class IntSorter extends IntroSorter {
761+ int pivot = -1 ;
762+ private final int [] arr ;
763+ private final IntToIntFunction func ;
764+
765+ private IntSorter (int [] arr , IntToIntFunction func ) {
766+ this .arr = arr ;
767+ this .func = func ;
768+ }
769+
770+ @ Override
771+ protected void setPivot (int i ) {
772+ pivot = func .apply (arr [i ]);
773+ }
774+
775+ @ Override
776+ protected int comparePivot (int j ) {
777+ return Integer .compare (pivot , func .apply (arr [j ]));
778+ }
779+
780+ @ Override
781+ protected int compare (int a , int b ) {
782+ return Integer .compare (func .apply (arr [a ]), func .apply (arr [b ]));
783+ }
784+
785+ @ Override
786+ protected void swap (int i , int j ) {
787+ final int tmp = arr [i ];
788+ arr [i ] = arr [j ];
789+ arr [j ] = tmp ;
790+ }
791+ }
720792}
0 commit comments