@@ -89,19 +89,25 @@ long[] buildAndWritePostingsLists(
             fieldInfo.getVectorDimension(),
             new OptimizedScalarQuantizer(fieldInfo.getVectorSimilarityFunction())
         );
+        final ByteBuffer buffer = ByteBuffer.allocate(fieldInfo.getVectorDimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
         for (int c = 0; c < centroidSupplier.size(); c++) {
             float[] centroid = centroidSupplier.centroid(c);
             int[] cluster = assignmentsByCluster[c];
-            // TODO align???
-            offsets[c] = postingsOutput.getFilePointer();
+            offsets[c] = postingsOutput.alignFilePointer(Float.BYTES);
+            buffer.asFloatBuffer().put(centroid);
+            // write raw centroid for quantizing the query vectors
+            postingsOutput.writeBytes(buffer.array(), buffer.array().length);
+            // write centroid dot product for quantizing the query vectors
+            postingsOutput.writeInt(Float.floatToIntBits(VectorUtil.dotProduct(centroid, centroid)));
             int size = cluster.length;
+            // write docIds
             postingsOutput.writeVInt(size);
-            postingsOutput.writeInt(Float.floatToIntBits(VectorUtil.dotProduct(centroid, centroid)));
             onHeapQuantizedVectors.reset(centroid, size, ord -> cluster[ord]);
             // TODO we might want to consider putting the docIds in a separate file
             // to aid with only having to fetch vectors from slower storage when they are required
             // keeping them in the same file indicates we pull the entire file into cache
             docIdsWriter.writeDocIds(j -> floatVectorValues.ordToDoc(cluster[j]), size, postingsOutput);
+            // write vectors
             bulkWriter.writeVectors(onHeapQuantizedVectors);
         }
 
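Note: after this change each posting list starts, at a float-aligned offset, with the raw centroid (little-endian floats) and its self dot product, followed by the vInt doc count, the docIds block and the bulk-quantized vectors. A minimal, hypothetical reader-side sketch of decoding that header (the `in` IndexInput and `dims` are illustrative assumptions, not part of the patch):

    // assumes `in` is positioned at offsets[c]; layout mirrors the writer above
    float[] centroid = new float[dims];
    ByteBuffer raw = ByteBuffer.allocate(dims * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
    in.readBytes(raw.array(), 0, raw.array().length);
    raw.asFloatBuffer().get(centroid);                      // raw centroid floats
    float centroidDp = Float.intBitsToFloat(in.readInt());  // centroid self dot product
    int size = in.readVInt();                               // number of vectors in the cluster
    // the docIds block and the bulk-quantized vectors follow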
@@ -206,20 +212,26 @@ long[] buildAndWritePostingsLists(
         );
         DocIdsWriter docIdsWriter = new DocIdsWriter();
         DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter.OneBitDiskBBQBulkWriter(ES91OSQVectorsScorer.BULK_SIZE, postingsOutput);
+        final ByteBuffer buffer = ByteBuffer.allocate(fieldInfo.getVectorDimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
         for (int c = 0; c < centroidSupplier.size(); c++) {
             float[] centroid = centroidSupplier.centroid(c);
             int[] cluster = assignmentsByCluster[c];
             boolean[] isOverspill = isOverspillByCluster[c];
-            // TODO align???
-            offsets[c] = postingsOutput.getFilePointer();
+            offsets[c] = postingsOutput.alignFilePointer(Float.BYTES);
+            // write raw centroid for quantizing the query vectors
+            buffer.asFloatBuffer().put(centroid);
+            postingsOutput.writeBytes(buffer.array(), buffer.array().length);
+            // write centroid dot product for quantizing the query vectors
+            postingsOutput.writeInt(Float.floatToIntBits(VectorUtil.dotProduct(centroid, centroid)));
+            // write docIds
             int size = cluster.length;
             postingsOutput.writeVInt(size);
-            postingsOutput.writeInt(Float.floatToIntBits(VectorUtil.dotProduct(centroid, centroid)));
             offHeapQuantizedVectors.reset(size, ord -> isOverspill[ord], ord -> cluster[ord]);
             // TODO we might want to consider putting the docIds in a separate file
             // to aid with only having to fetch vectors from slower storage when they are required
             // keeping them in the same file indicates we pull the entire file into cache
             docIdsWriter.writeDocIds(j -> floatVectorValues.ordToDoc(cluster[j]), size, postingsOutput);
+            // write vectors
             bulkWriter.writeVectors(offHeapQuantizedVectors);
         }
 
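The alignFilePointer(Float.BYTES) call replaces the old "TODO align???" by padding the postings output with zero bytes so each posting list, and hence the raw centroid floats, starts on a 4-byte boundary. A rough, simplified sketch of the effect (for illustration only; the real IndexOutput method handles this internally):

    long fp = postingsOutput.getFilePointer();
    int pad = (int) ((Float.BYTES - (fp % Float.BYTES)) % Float.BYTES);
    for (int i = 0; i < pad; i++) {
        postingsOutput.writeByte((byte) 0);    // zero padding up to the alignment boundary
    }
    long alignedOffset = postingsOutput.getFilePointer();  // now a multiple of Float.BYTES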
@@ -295,13 +307,8 @@ void writeCentroids(
             }
             writeQuantizedValue(centroidOutput, quantized, result);
         }
-        final ByteBuffer buffer = ByteBuffer.allocate(fieldInfo.getVectorDimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
+        // write the centroid offsets at the end of the file
         for (int i = 0; i < centroidSupplier.size(); i++) {
-            float[] centroid = centroidSupplier.centroid(i);
-            buffer.asFloatBuffer().put(centroid);
-            // write the centroids
-            centroidOutput.writeBytes(buffer.array(), buffer.array().length);
-            // write the offset of this posting list
             centroidOutput.writeLong(offsets[i]);
         }
     }
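With the raw centroids moved into the postings file, the centroids file now holds only the quantized centroids plus one long offset per posting list at its end. A hypothetical sketch of reading those trailing offsets back (`centroidsInput` and `numCentroids` are illustrative names, not from the patch):

    // read the per-posting-list offsets written at the end of the centroids file
    long[] offsets = new long[numCentroids];
    for (int i = 0; i < numCentroids; i++) {
        offsets[i] = centroidsInput.readLong();
    }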