@@ -92,19 +92,25 @@ LongValues buildAndWritePostingsLists(
9292 fieldInfo .getVectorDimension (),
9393 new OptimizedScalarQuantizer (fieldInfo .getVectorSimilarityFunction ())
9494 );
95+ final ByteBuffer buffer = ByteBuffer .allocate (fieldInfo .getVectorDimension () * Float .BYTES ).order (ByteOrder .LITTLE_ENDIAN );
9596 for (int c = 0 ; c < centroidSupplier .size (); c ++) {
9697 float [] centroid = centroidSupplier .centroid (c );
9798 int [] cluster = assignmentsByCluster [c ];
98- // TODO align???
99- offsets .add (postingsOutput .getFilePointer ());
99+ offsets .add (postingsOutput .alignFilePointer (Float .BYTES ));
100+ buffer .asFloatBuffer ().put (centroid );
101+ // write raw centroid for quantizing the query vectors
102+ postingsOutput .writeBytes (buffer .array (), buffer .array ().length );
103+ // write centroid dot product for quantizing the query vectors
104+ postingsOutput .writeInt (Float .floatToIntBits (VectorUtil .dotProduct (centroid , centroid )));
100105 int size = cluster .length ;
106+ // write docIds
101107 postingsOutput .writeVInt (size );
102- postingsOutput .writeInt (Float .floatToIntBits (VectorUtil .dotProduct (centroid , centroid )));
103108 onHeapQuantizedVectors .reset (centroid , size , ord -> cluster [ord ]);
104109 // TODO we might want to consider putting the docIds in a separate file
105110 // to aid with only having to fetch vectors from slower storage when they are required
106111 // keeping them in the same file indicates we pull the entire file into cache
107112 docIdsWriter .writeDocIds (j -> floatVectorValues .ordToDoc (cluster [j ]), size , postingsOutput );
113+ // write vectors
108114 bulkWriter .writeVectors (onHeapQuantizedVectors );
109115 }
110116
@@ -209,20 +215,26 @@ LongValues buildAndWritePostingsLists(
209215 );
210216 DocIdsWriter docIdsWriter = new DocIdsWriter ();
211217 DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter .OneBitDiskBBQBulkWriter (ES91OSQVectorsScorer .BULK_SIZE , postingsOutput );
218+ final ByteBuffer buffer = ByteBuffer .allocate (fieldInfo .getVectorDimension () * Float .BYTES ).order (ByteOrder .LITTLE_ENDIAN );
212219 for (int c = 0 ; c < centroidSupplier .size (); c ++) {
213220 float [] centroid = centroidSupplier .centroid (c );
214221 int [] cluster = assignmentsByCluster [c ];
215222 boolean [] isOverspill = isOverspillByCluster [c ];
216- offsets .add (postingsOutput .getFilePointer ());
223+ offsets .add (postingsOutput .alignFilePointer (Float .BYTES ));
224+ // write raw centroid for quantizing the query vectors
225+ buffer .asFloatBuffer ().put (centroid );
226+ postingsOutput .writeBytes (buffer .array (), buffer .array ().length );
227+ // write centroid dot product for quantizing the query vectors
228+ postingsOutput .writeInt (Float .floatToIntBits (VectorUtil .dotProduct (centroid , centroid )));
229+ // write docIds
217230 int size = cluster .length ;
218- // TODO align???
219231 postingsOutput .writeVInt (size );
220- postingsOutput .writeInt (Float .floatToIntBits (VectorUtil .dotProduct (centroid , centroid )));
221232 offHeapQuantizedVectors .reset (size , ord -> isOverspill [ord ], ord -> cluster [ord ]);
222233 // TODO we might want to consider putting the docIds in a separate file
223234 // to aid with only having to fetch vectors from slower storage when they are required
224235 // keeping them in the same file indicates we pull the entire file into cache
225236 docIdsWriter .writeDocIds (j -> floatVectorValues .ordToDoc (cluster [j ]), size , postingsOutput );
237+ // write vectors
226238 bulkWriter .writeVectors (offHeapQuantizedVectors );
227239 }
228240
@@ -298,13 +310,8 @@ void writeCentroids(
298310 }
299311 writeQuantizedValue (centroidOutput , quantized , result );
300312 }
301- final ByteBuffer buffer = ByteBuffer . allocate ( fieldInfo . getVectorDimension () * Float . BYTES ). order ( ByteOrder . LITTLE_ENDIAN );
313+ // write the centroid offsets at the end of the file
302314 for (int i = 0 ; i < centroidSupplier .size (); i ++) {
303- float [] centroid = centroidSupplier .centroid (i );
304- buffer .asFloatBuffer ().put (centroid );
305- // write the centroids
306- centroidOutput .writeBytes (buffer .array (), buffer .array ().length );
307- // write the offset of this posting list
308315 centroidOutput .writeLong (offsets .get (i ));
309316 }
310317 }
0 commit comments