1818import org .apache .lucene .store .IndexInput ;
1919import org .apache .lucene .store .IndexOutput ;
2020import org .apache .lucene .util .IntroSorter ;
21- import org .apache .lucene .util .LongValues ;
2221import org .apache .lucene .util .VectorUtil ;
2322import org .apache .lucene .util .hnsw .IntToIntFunction ;
2423import org .apache .lucene .util .packed .PackedInts ;
@@ -60,7 +59,7 @@ public DefaultIVFVectorsWriter(
6059 }
6160
6261 @ Override
63- LongValues buildAndWritePostingsLists (
62+ CentroidOffsetAndLength buildAndWritePostingsLists (
6463 FieldInfo fieldInfo ,
6564 CentroidSupplier centroidSupplier ,
6665 FloatVectorValues floatVectorValues ,
@@ -102,6 +101,7 @@ LongValues buildAndWritePostingsLists(
102101 postingsOutput .writeVInt (maxPostingListSize );
103102 // write the posting lists
104103 final PackedLongValues .Builder offsets = PackedLongValues .monotonicBuilder (PackedInts .COMPACT );
104+ final PackedLongValues .Builder lengths = PackedLongValues .monotonicBuilder (PackedInts .COMPACT );
105105 DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter .OneBitDiskBBQBulkWriter (ES91OSQVectorsScorer .BULK_SIZE , postingsOutput );
106106 OnHeapQuantizedVectors onHeapQuantizedVectors = new OnHeapQuantizedVectors (
107107 floatVectorValues ,
@@ -116,7 +116,8 @@ LongValues buildAndWritePostingsLists(
116116 for (int c = 0 ; c < centroidSupplier .size (); c ++) {
117117 float [] centroid = centroidSupplier .centroid (c );
118118 int [] cluster = assignmentsByCluster [c ];
119- offsets .add (postingsOutput .alignFilePointer (Float .BYTES ) - fileOffset );
119+ long offset = postingsOutput .alignFilePointer (Float .BYTES ) - fileOffset ;
120+ offsets .add (offset );
120121 buffer .asFloatBuffer ().put (centroid );
121122 // write raw centroid for quantizing the query vectors
122123 postingsOutput .writeBytes (buffer .array (), buffer .array ().length );
@@ -142,17 +143,18 @@ LongValues buildAndWritePostingsLists(
142143 idsWriter .writeDocIds (i -> docDeltas [i ], size , postingsOutput );
143144 // write vectors
144145 bulkWriter .writeVectors (onHeapQuantizedVectors );
146+ lengths .add (postingsOutput .getFilePointer () - fileOffset - offset );
145147 }
146148
147149 if (logger .isDebugEnabled ()) {
148150 printClusterQualityStatistics (assignmentsByCluster );
149151 }
150152
151- return offsets .build ();
153+ return new CentroidOffsetAndLength ( offsets .build (), lengths . build () );
152154 }
153155
154156 @ Override
155- LongValues buildAndWritePostingsLists (
157+ CentroidOffsetAndLength buildAndWritePostingsLists (
156158 FieldInfo fieldInfo ,
157159 CentroidSupplier centroidSupplier ,
158160 FloatVectorValues floatVectorValues ,
@@ -243,6 +245,7 @@ LongValues buildAndWritePostingsLists(
243245 // now we can read the quantized vectors from the temporary file
244246 try (IndexInput quantizedVectorsInput = mergeState .segmentInfo .dir .openInput (quantizedVectorsTempName , IOContext .DEFAULT )) {
245247 final PackedLongValues .Builder offsets = PackedLongValues .monotonicBuilder (PackedInts .COMPACT );
248+ final PackedLongValues .Builder lengths = PackedLongValues .monotonicBuilder (PackedInts .COMPACT );
246249 OffHeapQuantizedVectors offHeapQuantizedVectors = new OffHeapQuantizedVectors (
247250 quantizedVectorsInput ,
248251 fieldInfo .getVectorDimension ()
@@ -260,7 +263,8 @@ LongValues buildAndWritePostingsLists(
260263 float [] centroid = centroidSupplier .centroid (c );
261264 int [] cluster = assignmentsByCluster [c ];
262265 boolean [] isOverspill = isOverspillByCluster [c ];
263- offsets .add (postingsOutput .alignFilePointer (Float .BYTES ) - fileOffset );
266+ long offset = postingsOutput .alignFilePointer (Float .BYTES ) - fileOffset ;
267+ offsets .add (offset );
264268 // write raw centroid for quantizing the query vectors
265269 buffer .asFloatBuffer ().put (centroid );
266270 postingsOutput .writeBytes (buffer .array (), buffer .array ().length );
@@ -286,12 +290,14 @@ LongValues buildAndWritePostingsLists(
286290 idsWriter .writeDocIds (i -> docDeltas [i ], size , postingsOutput );
287291 // write vectors
288292 bulkWriter .writeVectors (offHeapQuantizedVectors );
293+ lengths .add (postingsOutput .getFilePointer () - fileOffset - offset );
294+ // lengths.add(1);
289295 }
290296
291297 if (logger .isDebugEnabled ()) {
292298 printClusterQualityStatistics (assignmentsByCluster );
293299 }
294- return offsets .build ();
300+ return new CentroidOffsetAndLength ( offsets .build (), lengths . build () );
295301 }
296302 }
297303
@@ -335,24 +341,24 @@ void writeCentroids(
335341 FieldInfo fieldInfo ,
336342 CentroidSupplier centroidSupplier ,
337343 float [] globalCentroid ,
338- LongValues offsets ,
344+ CentroidOffsetAndLength centroidOffsetAndLength ,
339345 IndexOutput centroidOutput
340346 ) throws IOException {
341347 // TODO do we want to store these distances as well for future use?
342348 // TODO: sort centroids by global centroid (was doing so previously here)
343349 // TODO: sorting tanks recall possibly because centroids ordinals no longer are aligned
344350 if (centroidSupplier .size () > centroidsPerParentCluster * centroidsPerParentCluster ) {
345- writeCentroidsWithParents (fieldInfo , centroidSupplier , globalCentroid , offsets , centroidOutput );
351+ writeCentroidsWithParents (fieldInfo , centroidSupplier , globalCentroid , centroidOffsetAndLength , centroidOutput );
346352 } else {
347- writeCentroidsWithoutParents (fieldInfo , centroidSupplier , globalCentroid , offsets , centroidOutput );
353+ writeCentroidsWithoutParents (fieldInfo , centroidSupplier , globalCentroid , centroidOffsetAndLength , centroidOutput );
348354 }
349355 }
350356
351357 private void writeCentroidsWithParents (
352358 FieldInfo fieldInfo ,
353359 CentroidSupplier centroidSupplier ,
354360 float [] globalCentroid ,
355- LongValues offsets ,
361+ CentroidOffsetAndLength centroidOffsetAndLength ,
356362 IndexOutput centroidOutput
357363 ) throws IOException {
358364 DiskBBQBulkWriter .SevenBitDiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter .SevenBitDiskBBQBulkWriter (
@@ -392,7 +398,8 @@ private void writeCentroidsWithParents(
392398 for (int i = 0 ; i < centroidGroups .centroids ().length ; i ++) {
393399 final int [] centroidAssignments = centroidGroups .vectors ()[i ];
394400 for (int assignment : centroidAssignments ) {
395- centroidOutput .writeLong (offsets .get (assignment ));
401+ centroidOutput .writeLong (centroidOffsetAndLength .offsets ().get (assignment ));
402+ centroidOutput .writeLong (centroidOffsetAndLength .lengths ().get (assignment ));
396403 }
397404 }
398405 }
@@ -401,7 +408,7 @@ private void writeCentroidsWithoutParents(
401408 FieldInfo fieldInfo ,
402409 CentroidSupplier centroidSupplier ,
403410 float [] globalCentroid ,
404- LongValues offsets ,
411+ CentroidOffsetAndLength centroidOffsetAndLength ,
405412 IndexOutput centroidOutput
406413 ) throws IOException {
407414 centroidOutput .writeVInt (0 );
@@ -419,7 +426,8 @@ private void writeCentroidsWithoutParents(
419426 bulkWriter .writeVectors (quantizedCentroids );
420427 // write the centroid offsets at the end of the file
421428 for (int i = 0 ; i < centroidSupplier .size (); i ++) {
422- centroidOutput .writeLong (offsets .get (i ));
429+ centroidOutput .writeLong (centroidOffsetAndLength .offsets ().get (i ));
430+ centroidOutput .writeLong (centroidOffsetAndLength .lengths ().get (i ));
423431 }
424432 }
425433
0 commit comments