Skip to content

Commit b57ee3b

Browse files
authored
[DiskBBQ] Remove posting lists offsets from meta file (elastic#131379)
This commit proposes removing the posting list offsets from the meta file; instead, each offset is written just after the centroids, together with its raw centroid.
1 parent 9226a65 commit b57ee3b

File tree

4 files changed

+103
-102
lines changed

4 files changed

+103
-102
lines changed

server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,11 @@ CentroidQueryScorer getCentroidScorer(FieldInfo fieldInfo, int numCentroids, Ind
6767
final ES91Int4VectorsScorer scorer = ESVectorUtil.getES91Int4VectorsScorer(centroids, fieldInfo.getVectorDimension());
6868
return new CentroidQueryScorer() {
6969
int currentCentroid = -1;
70+
long postingListOffset;
7071
private final float[] centroid = new float[fieldInfo.getVectorDimension()];
7172
private final float[] centroidCorrectiveValues = new float[3];
7273
private final long rawCentroidsOffset = (long) numCentroids * (fieldInfo.getVectorDimension() + 3 * Float.BYTES + Short.BYTES);
73-
private final long rawCentroidsByteSize = (long) Float.BYTES * fieldInfo.getVectorDimension();
74+
private final long rawCentroidsByteSize = (long) Float.BYTES * fieldInfo.getVectorDimension() + Long.BYTES;
7475

7576
@Override
7677
public int size() {
@@ -79,12 +80,23 @@ public int size() {
7980

8081
@Override
8182
public float[] centroid(int centroidOrdinal) throws IOException {
83+
readDataIfNecessary(centroidOrdinal);
84+
return centroid;
85+
}
86+
87+
@Override
88+
public long postingListOffset(int centroidOrdinal) throws IOException {
89+
readDataIfNecessary(centroidOrdinal);
90+
return postingListOffset;
91+
}
92+
93+
private void readDataIfNecessary(int centroidOrdinal) throws IOException {
8294
if (centroidOrdinal != currentCentroid) {
8395
centroids.seek(rawCentroidsOffset + rawCentroidsByteSize * centroidOrdinal);
8496
centroids.readFloats(centroid, 0, centroid.length);
97+
postingListOffset = centroids.readLong();
8598
currentCentroid = centroidOrdinal;
8699
}
87-
return centroid;
88100
}
89101

90102
public void bulkScore(NeighborQueue queue) throws IOException {
@@ -217,9 +229,9 @@ private static class MemorySegmentPostingsVisitor implements PostingVisitor {
217229
}
218230

219231
@Override
220-
public int resetPostingsScorer(int centroidOrdinal, float[] centroid) throws IOException {
232+
public int resetPostingsScorer(long offset, float[] centroid) throws IOException {
221233
quantized = false;
222-
indexInput.seek(entry.postingListOffsets()[centroidOrdinal]);
234+
indexInput.seek(offset);
223235
vectors = indexInput.readVInt();
224236
centroidDp = Float.intBitsToFloat(indexInput.readInt());
225237
this.centroid = centroid;

server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java

Lines changed: 27 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -265,71 +265,66 @@ CentroidSupplier createCentroidSupplier(IndexInput centroidsInput, int numCentro
265265
return new OffHeapCentroidSupplier(centroidsInput, numCentroids, fieldInfo);
266266
}
267267

268-
static void writeCentroids(float[][] centroids, FieldInfo fieldInfo, float[] globalCentroid, IndexOutput centroidOutput)
269-
throws IOException {
268+
@Override
269+
void writeCentroids(
270+
FieldInfo fieldInfo,
271+
CentroidSupplier centroidSupplier,
272+
float[] globalCentroid,
273+
long[] offsets,
274+
IndexOutput centroidOutput
275+
) throws IOException {
276+
270277
final OptimizedScalarQuantizer osq = new OptimizedScalarQuantizer(fieldInfo.getVectorSimilarityFunction());
271278
int[] quantizedScratch = new int[fieldInfo.getVectorDimension()];
272279
float[] centroidScratch = new float[fieldInfo.getVectorDimension()];
273280
final byte[] quantized = new byte[fieldInfo.getVectorDimension()];
274281
// TODO do we want to store these distances as well for future use?
275282
// TODO: sort centroids by global centroid (was doing so previously here)
276283
// TODO: sorting tanks recall possibly because centroids ordinals no longer are aligned
277-
for (float[] centroid : centroids) {
284+
for (int i = 0; i < centroidSupplier.size(); i++) {
285+
float[] centroid = centroidSupplier.centroid(i);
278286
System.arraycopy(centroid, 0, centroidScratch, 0, centroid.length);
279287
OptimizedScalarQuantizer.QuantizationResult result = osq.scalarQuantize(
280288
centroidScratch,
281289
quantizedScratch,
282290
(byte) 4,
283291
globalCentroid
284292
);
285-
for (int i = 0; i < quantizedScratch.length; i++) {
286-
quantized[i] = (byte) quantizedScratch[i];
293+
for (int j = 0; j < quantizedScratch.length; j++) {
294+
quantized[j] = (byte) quantizedScratch[j];
287295
}
288296
writeQuantizedValue(centroidOutput, quantized, result);
289297
}
290298
final ByteBuffer buffer = ByteBuffer.allocate(fieldInfo.getVectorDimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
291-
for (float[] centroid : centroids) {
299+
for (int i = 0; i < centroidSupplier.size(); i++) {
300+
float[] centroid = centroidSupplier.centroid(i);
292301
buffer.asFloatBuffer().put(centroid);
302+
// write the centroids
293303
centroidOutput.writeBytes(buffer.array(), buffer.array().length);
304+
// write the offset of this posting list
305+
centroidOutput.writeLong(offsets[i]);
294306
}
295307
}
296308

297-
@Override
298-
CentroidAssignments calculateAndWriteCentroids(
299-
FieldInfo fieldInfo,
300-
FloatVectorValues floatVectorValues,
301-
IndexOutput centroidOutput,
302-
MergeState mergeState,
303-
float[] globalCentroid
304-
) throws IOException {
305-
// TODO: take advantage of prior generated clusters from mergeState in the future
306-
return calculateAndWriteCentroids(fieldInfo, floatVectorValues, centroidOutput, globalCentroid);
307-
}
308-
309309
/**
310-
* Calculate the centroids for the given field and write them to the given centroid output.
310+
* Calculate the centroids for the given field.
311311
* We use the {@link HierarchicalKMeans} algorithm to partition the space of all vectors across merging segments
312312
*
313313
* @param fieldInfo merging field info
314314
* @param floatVectorValues the float vector values to merge
315-
* @param centroidOutput the centroid output
316315
* @param globalCentroid the global centroid, calculated by this method and used to quantize the centroids
317316
* @return the vector assignments, soar assignments, and if asked the centroids themselves that were computed
318317
* @throws IOException if an I/O error occurs
319318
*/
320319
@Override
321-
CentroidAssignments calculateAndWriteCentroids(
322-
FieldInfo fieldInfo,
323-
FloatVectorValues floatVectorValues,
324-
IndexOutput centroidOutput,
325-
float[] globalCentroid
326-
) throws IOException {
320+
CentroidAssignments calculateCentroids(FieldInfo fieldInfo, FloatVectorValues floatVectorValues, float[] globalCentroid)
321+
throws IOException {
327322

328323
long nanoTime = System.nanoTime();
329324

330325
// TODO: consider hinting / bootstrapping hierarchical kmeans with the prior segments centroids
331-
KMeansResult kMeansResult = new HierarchicalKMeans(floatVectorValues.dimension()).cluster(floatVectorValues, vectorPerCluster);
332-
float[][] centroids = kMeansResult.centroids();
326+
CentroidAssignments centroidAssignments = buildCentroidAssignments(floatVectorValues, vectorPerCluster);
327+
float[][] centroids = centroidAssignments.centroids();
333328
// TODO: for flush we are doing this over the vectors and here centroids which seems duplicative
334329
// preliminary tests suggest recall is good using only centroids but need to do further evaluation
335330
// TODO: push this logic into vector util?
@@ -342,17 +337,15 @@ CentroidAssignments calculateAndWriteCentroids(
342337
globalCentroid[j] /= centroids.length;
343338
}
344339

345-
// write centroids
346-
writeCentroids(centroids, fieldInfo, globalCentroid, centroidOutput);
347-
348340
if (logger.isDebugEnabled()) {
349341
logger.debug("calculate centroids and assign vectors time ms: {}", (System.nanoTime() - nanoTime) / 1000000.0);
350342
logger.debug("final centroid count: {}", centroids.length);
351343
}
352-
return buildCentroidAssignments(kMeansResult);
344+
return centroidAssignments;
353345
}
354346

355-
static CentroidAssignments buildCentroidAssignments(KMeansResult kMeansResult) {
347+
static CentroidAssignments buildCentroidAssignments(FloatVectorValues floatVectorValues, int vectorPerCluster) throws IOException {
348+
KMeansResult kMeansResult = new HierarchicalKMeans(floatVectorValues.dimension()).cluster(floatVectorValues, vectorPerCluster);
356349
float[][] centroids = kMeansResult.centroids();
357350
int[] assignments = kMeansResult.assignments();
358351
int[] soarAssignments = kMeansResult.soarAssignments();
@@ -374,15 +367,13 @@ static class OffHeapCentroidSupplier implements CentroidSupplier {
374367
private final int numCentroids;
375368
private final int dimension;
376369
private final float[] scratch;
377-
private final long rawCentroidOffset;
378370
private int currOrd = -1;
379371

380372
OffHeapCentroidSupplier(IndexInput centroidsInput, int numCentroids, FieldInfo info) {
381373
this.centroidsInput = centroidsInput;
382374
this.numCentroids = numCentroids;
383375
this.dimension = info.getVectorDimension();
384376
this.scratch = new float[dimension];
385-
this.rawCentroidOffset = (dimension + 3 * Float.BYTES + Short.BYTES) * numCentroids;
386377
}
387378

388379
@Override
@@ -395,7 +386,7 @@ public float[] centroid(int centroidOrdinal) throws IOException {
395386
if (centroidOrdinal == currOrd) {
396387
return scratch;
397388
}
398-
centroidsInput.seek(rawCentroidOffset + (long) centroidOrdinal * dimension * Float.BYTES);
389+
centroidsInput.seek((long) centroidOrdinal * dimension * Float.BYTES);
399390
centroidsInput.readFloats(scratch, 0, dimension);
400391
this.currOrd = centroidOrdinal;
401392
return scratch;

server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsReader.java

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -140,19 +140,6 @@ private void readFields(ChecksumIndexInput meta) throws IOException {
140140
private FieldEntry readField(IndexInput input, FieldInfo info) throws IOException {
141141
final VectorEncoding vectorEncoding = readVectorEncoding(input);
142142
final VectorSimilarityFunction similarityFunction = readSimilarityFunction(input);
143-
final long centroidOffset = input.readLong();
144-
final long centroidLength = input.readLong();
145-
final int numPostingLists = input.readVInt();
146-
final long[] postingListOffsets = new long[numPostingLists];
147-
for (int i = 0; i < numPostingLists; i++) {
148-
postingListOffsets[i] = input.readLong();
149-
}
150-
final float[] globalCentroid = new float[info.getVectorDimension()];
151-
float globalCentroidDp = 0;
152-
if (numPostingLists > 0) {
153-
input.readFloats(globalCentroid, 0, globalCentroid.length);
154-
globalCentroidDp = Float.intBitsToFloat(input.readInt());
155-
}
156143
if (similarityFunction != info.getVectorSimilarityFunction()) {
157144
throw new IllegalStateException(
158145
"Inconsistent vector similarity function for field=\""
@@ -163,12 +150,21 @@ private FieldEntry readField(IndexInput input, FieldInfo info) throws IOExceptio
163150
+ info.getVectorSimilarityFunction()
164151
);
165152
}
153+
final int numCentroids = input.readInt();
154+
final long centroidOffset = input.readLong();
155+
final long centroidLength = input.readLong();
156+
final float[] globalCentroid = new float[info.getVectorDimension()];
157+
float globalCentroidDp = 0;
158+
if (centroidLength > 0) {
159+
input.readFloats(globalCentroid, 0, globalCentroid.length);
160+
globalCentroidDp = Float.intBitsToFloat(input.readInt());
161+
}
166162
return new FieldEntry(
167163
similarityFunction,
168164
vectorEncoding,
165+
numCentroids,
169166
centroidOffset,
170167
centroidLength,
171-
postingListOffsets,
172168
globalCentroid,
173169
globalCentroidDp
174170
);
@@ -242,7 +238,7 @@ public final void search(String field, float[] target, KnnCollector knnCollector
242238
FieldEntry entry = fields.get(fieldInfo.number);
243239
CentroidQueryScorer centroidQueryScorer = getCentroidScorer(
244240
fieldInfo,
245-
entry.postingListOffsets.length,
241+
entry.numCentroids,
246242
entry.centroidSlice(ivfCentroids),
247243
target
248244
);
@@ -270,7 +266,10 @@ public final void search(String field, float[] target, KnnCollector knnCollector
270266
int centroidOrdinal = centroidQueue.pop();
271267
// todo do we need direct access to the raw centroid???, this is used for quantizing, maybe hydrating and quantizing
272268
// is enough?
273-
expectedDocs += scorer.resetPostingsScorer(centroidOrdinal, centroidQueryScorer.centroid(centroidOrdinal));
269+
expectedDocs += scorer.resetPostingsScorer(
270+
centroidQueryScorer.postingListOffset(centroidOrdinal),
271+
centroidQueryScorer.centroid(centroidOrdinal)
272+
);
274273
actualDocs += scorer.visit(knnCollector);
275274
}
276275
if (acceptDocs != null) {
@@ -279,7 +278,10 @@ public final void search(String field, float[] target, KnnCollector knnCollector
279278
float expectedScored = Math.min(2 * filteredVectors * unfilteredRatioVisited, expectedDocs / 2f);
280279
while (centroidQueue.size() > 0 && (actualDocs < expectedScored || actualDocs < knnCollector.k())) {
281280
int centroidOrdinal = centroidQueue.pop();
282-
scorer.resetPostingsScorer(centroidOrdinal, centroidQueryScorer.centroid(centroidOrdinal));
281+
scorer.resetPostingsScorer(
282+
centroidQueryScorer.postingListOffset(centroidOrdinal),
283+
centroidQueryScorer.centroid(centroidOrdinal)
284+
);
283285
actualDocs += scorer.visit(knnCollector);
284286
}
285287
}
@@ -313,9 +315,9 @@ public void close() throws IOException {
313315
protected record FieldEntry(
314316
VectorSimilarityFunction similarityFunction,
315317
VectorEncoding vectorEncoding,
318+
int numCentroids,
316319
long centroidOffset,
317320
long centroidLength,
318-
long[] postingListOffsets,
319321
float[] globalCentroid,
320322
float globalCentroidDp
321323
) {
@@ -332,14 +334,16 @@ interface CentroidQueryScorer {
332334

333335
float[] centroid(int centroidOrdinal) throws IOException;
334336

337+
long postingListOffset(int centroidOrdinal) throws IOException;
338+
335339
void bulkScore(NeighborQueue queue) throws IOException;
336340
}
337341

338342
interface PostingVisitor {
339343
// TODO maybe we can not specifically pass the centroid...
340344

341345
/** returns the number of documents in the posting list */
342-
int resetPostingsScorer(int centroidOrdinal, float[] centroid) throws IOException;
346+
int resetPostingsScorer(long offset, float[] centroid) throws IOException;
343347

344348
/** returns the number of scored documents */
345349
int visit(KnnCollector collector) throws IOException;

0 commit comments

Comments
 (0)