@@ -321,11 +321,10 @@ private static class MemorySegmentPostingsVisitor implements PostingVisitor {
final float[] correctionsAdd = new float[BULK_SIZE];
final int[] docIdsScratch;

int vectors;
int totalVectors;
boolean quantized = false;
float centroidDp;
final float[] centroid;
long slicePos;
OptimizedScalarQuantizer.QuantizationResult queryCorrections;
DocIdsWriter docIdsWriter = new DocIdsWriter();

@@ -367,12 +366,9 @@ public int resetPostingsScorer(long offset) throws IOException {
indexInput.seek(offset);
indexInput.readFloats(centroid, 0, centroid.length);
centroidDp = Float.intBitsToFloat(indexInput.readInt());
vectors = indexInput.readVInt();
// read the doc ids
assert vectors <= docIdsScratch.length;
docIdsWriter.readInts(indexInput, vectors, docIdsScratch);
slicePos = indexInput.getFilePointer();
return vectors;
totalVectors = indexInput.readVInt();

return totalVectors;
}

float scoreIndividually(int offset) throws IOException {
@@ -381,13 +377,13 @@ float scoreIndividually(int offset) throws IOException {
for (int j = 0; j < BULK_SIZE; j++) {
int doc = docIdsScratch[j + offset];
if (doc != -1) {
indexInput.seek(slicePos + (offset * quantizedByteLength) + (j * quantizedVectorByteSize));
float qcDist = osqVectorsScorer.quantizeScore(quantizedQueryScratch);
scores[j] = qcDist;
} else {
indexInput.skipBytes(quantizedVectorByteSize);
}
}
// read in all corrections
indexInput.seek(slicePos + (offset * quantizedByteLength) + (BULK_SIZE * quantizedVectorByteSize));
indexInput.readFloats(correctionsLower, 0, BULK_SIZE);
indexInput.readFloats(correctionsUpper, 0, BULK_SIZE);
for (int j = 0; j < BULK_SIZE; j++) {
@@ -444,18 +440,36 @@ private static int collect(int[] docIds, int offset, KnnCollector knnCollector,

@Override
public int visit(KnnCollector knnCollector) throws IOException {
byte postingListType = indexInput.readByte();
Member:

What if we just always put the block of docs at the start of every block?

So every block is
`[blk0, blk1, blk2, ... tail] [[encoded doc ids, vectors],...[tail encoded doc ids, vectors]]`

We know the block size (16), we know the previous base block (if we want to delta encode eventually).

If we ever split soar and regular docs, we can delta encode with the "doc base" (just like regular postings list).

Are we concerned about speed or just size increase?
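(Editor's sketch, for illustration only: one reading of the layout above, where the posting list is a sequence of fixed-size blocks plus a tail, and each block starts with its own encoded doc ids followed by its vectors. `DocIdBlockEncoder` and `VectorBlockWriter` are hypothetical stand-ins for `DocIdsWriter` and `DiskBBQBulkWriter`; the block size of 16 comes from the comment above.)

```java
import java.io.IOException;
import org.apache.lucene.store.IndexOutput;

// Illustration of the proposed shape: every block starts with its encoded doc ids,
// immediately followed by the quantized vectors for those docs.
final class BlockedPostingListSketch {

    static final int BLOCK_SIZE = 16; // one bulk of vectors per block

    // hypothetical stand-ins for DocIdsWriter / DiskBBQBulkWriter
    interface DocIdBlockEncoder {
        void write(IndexOutput out, int[] docIds, int from, int len) throws IOException;
    }

    interface VectorBlockWriter {
        void write(IndexOutput out, int from, int len) throws IOException;
    }

    static void writePostingList(IndexOutput out, int[] docIds, DocIdBlockEncoder docs, VectorBlockWriter vectors)
        throws IOException {
        for (int start = 0; start < docIds.length; start += BLOCK_SIZE) {
            int len = Math.min(BLOCK_SIZE, docIds.length - start); // last iteration is the tail
            docs.write(out, docIds, start, len);  // [encoded doc ids for this block]
            vectors.write(out, start, len);       // [vectors for this block]
        }
    }
}
```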

Contributor Author:

This is fair, I can have a go at it.

Contributor Author:

Introduced this approach in 71e30a1. Much simpler.

Member:

Much simpler indeed. My only concern is index size & performance. I would expect them to be mostly comparable, but you never know.

Contributor Author:

Given the way docIdsWriter works, I would expect better compression of the docIds with the penalty of one byte per 16 vectors, so all in all it should be the same size or even smaller (I am checking).
I don't expect, nor do I see, any performance implications.
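(Editor's note, purely for scale: at one flag byte per group of 16 doc ids, a posting file covering one million vectors would carry about 1,000,000 / 16 = 62,500 extra bytes, roughly 61 KB, which is small next to the quantized vector data.)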

Contributor Author (@iverase, Aug 7, 2025):

Checking the posting list size for 1M vectors with 1024 dims:

main: 302,780,965 bytes
PR: 301,815,585 bytes

Member:

even smaller?!?!? awesome!

Contributor Author (@iverase, Aug 7, 2025):

Flamegraphs show a performance penalty (because of the extra byte).

main: [flamegraph image]

PR: [flamegraph image]

Member:

Ah, it breaks alignment, which is frustrating.

I am testing with sorted doc-ids right now (now that we aren't skipping duplicate vectors).

GroupVarInt also has a "single byte read" to determine the output flag. Having a single byte read for every group of 16 integers does seem weird.

I wonder if we can do something more clever by delta-encoding all the vectors (we read all the blocks in order anyway, so we can keep the running sum) and picking the appropriate encoding that works for all the blocks. Then we can write that encoding byte at the front of the entire list and have a uniform encoding for every block.

This might be slightly less disk efficient, but it will likely align better.
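(Editor's sketch, for illustration only, reading "delta-encoding all the vectors" as delta-encoding the doc ids across blocks: pick one width that fits every delta in the list, write that choice once up front, then encode every block uniformly. The class name and the byte-aligned packer below are made up, and sorted doc ids are assumed.)

```java
import java.io.IOException;
import org.apache.lucene.store.DataOutput;

// Not the PR's implementation: a uniform, delta-encoded doc id layout where a single
// encoding byte at the front of the list applies to every block.
final class UniformDeltaDocIdsSketch {

    static void writeList(DataOutput out, int[] sortedDocIds) throws IOException {
        // first pass: the widest delta anywhere in the list decides the encoding
        int maxDelta = 0;
        for (int i = 0; i < sortedDocIds.length; i++) {
            int prev = i == 0 ? 0 : sortedDocIds[i - 1];
            maxDelta = Math.max(maxDelta, sortedDocIds[i] - prev);
        }
        int bitsPerDelta = Math.max(1, 32 - Integer.numberOfLeadingZeros(maxDelta));
        out.writeByte((byte) bitsPerDelta); // one encoding byte for the entire list

        // second pass: every delta (and therefore every block) uses the same width,
        // so block boundaries stay at predictable offsets
        for (int i = 0; i < sortedDocIds.length; i++) {
            int prev = i == 0 ? 0 : sortedDocIds[i - 1];
            writeFixedWidth(out, sortedDocIds[i] - prev, bitsPerDelta);
        }
    }

    // stand-in for a real bit packer; byte-aligned to keep the sketch simple
    private static void writeFixedWidth(DataOutput out, int value, int bits) throws IOException {
        int bytes = (bits + 7) / 8;
        for (int b = 0; b < bytes; b++) {
            out.writeByte((byte) (value >>> (8 * b)));
        }
    }
}
```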

if (postingListType == DefaultIVFVectorsWriter.SINGLE_BLOCK_POSTING_LIST) {
return singleBlockVisit(knnCollector, totalVectors);
} else {
assert postingListType == DefaultIVFVectorsWriter.MULTI_BLOCK_POSTING_LIST;
final int numBlocks = indexInput.readVInt();
int scoredDocs = 0;
for (int i = 0; i < numBlocks; i++) {
final int numVectors = indexInput.readVInt();
scoredDocs += singleBlockVisit(knnCollector, numVectors);
}
return scoredDocs;
}
}

private int singleBlockVisit(KnnCollector knnCollector, int numVectors) throws IOException {
assert numVectors <= docIdsScratch.length : "numVectors: " + numVectors + ", docIdsScratch.length: " + docIdsScratch.length;
docIdsWriter.readInts(indexInput, numVectors, docIdsScratch);
// block processing
int scoredDocs = 0;
int limit = vectors - BULK_SIZE + 1;
int limit = numVectors - BULK_SIZE + 1;
int i = 0;
for (; i < limit; i += BULK_SIZE) {
int docsToScore = BULK_SIZE - filterDocs(docIdsScratch, i, needsScoring);
if (docsToScore == 0) {
indexInput.skipBytes(BULK_SIZE * quantizedByteLength);
continue;
}
quantizeQueryIfNecessary();
indexInput.seek(slicePos + i * quantizedByteLength);
float maxScore = Float.NEGATIVE_INFINITY;
float maxScore;
if (docsToScore < BULK_SIZE / 2) {
maxScore = scoreIndividually(i);
} else {
@@ -475,11 +489,10 @@ public int visit(KnnCollector knnCollector) throws IOException {
}
}
// process tail
for (; i < vectors; i++) {
for (; i < numVectors; i++) {
int doc = docIdsScratch[i];
if (needsScoring.test(doc)) {
quantizeQueryIfNecessary();
indexInput.seek(slicePos + i * quantizedByteLength);
float qcDist = osqVectorsScorer.quantizeScore(quantizedQueryScratch);
indexInput.readFloats(correctiveValues, 0, 3);
final int quantizedComponentSum = Short.toUnsignedInt(indexInput.readShort());
@@ -498,6 +511,8 @@ public int visit(KnnCollector knnCollector) throws IOException {
);
scoredDocs++;
knnCollector.collect(doc, score);
} else {
indexInput.skipBytes(quantizedByteLength);
}
}
if (scoredDocs > 0) {
@@ -35,6 +35,7 @@
import java.nio.ByteOrder;
import java.util.AbstractList;
import java.util.Arrays;
import java.util.function.IntPredicate;

/**
* Default implementation of {@link IVFVectorsWriter}. It uses {@link HierarchicalKMeans} algorithm to
@@ -43,6 +44,10 @@
*/
public class DefaultIVFVectorsWriter extends IVFVectorsWriter {
private static final Logger logger = LogManager.getLogger(DefaultIVFVectorsWriter.class);
// posting lists bigger than this will be split into two or more blocks
private static final int MAX_POSTING_LIST_BLOCK_SIZE = 16 * 100;
public static final byte SINGLE_BLOCK_POSTING_LIST = 0;
public static final byte MULTI_BLOCK_POSTING_LIST = 1;

private final int vectorPerCluster;
private final int centroidsPerParentCluster;
@@ -98,7 +103,7 @@ LongValues buildAndWritePostingsLists(
}
}
// write the max posting list size
postingsOutput.writeVInt(maxPostingListSize);
postingsOutput.writeVInt(Math.min(MAX_POSTING_LIST_BLOCK_SIZE, maxPostingListSize));
// write the posting lists
final PackedLongValues.Builder offsets = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
DocIdsWriter docIdsWriter = new DocIdsWriter();
@@ -121,13 +126,31 @@ LongValues buildAndWritePostingsLists(
int size = cluster.length;
// write docIds
postingsOutput.writeVInt(size);
onHeapQuantizedVectors.reset(centroid, size, ord -> cluster[ord]);
// TODO we might want to consider putting the docIds in a separate file
// to aid with only having to fetch vectors from slower storage when they are required
// keeping them in the same file indicates we pull the entire file into cache
docIdsWriter.writeDocIds(j -> floatVectorValues.ordToDoc(cluster[j]), size, postingsOutput);
// write vectors
bulkWriter.writeVectors(onHeapQuantizedVectors);
if (size > MAX_POSTING_LIST_BLOCK_SIZE) {
postingsOutput.writeByte(MULTI_BLOCK_POSTING_LIST);
writeOnHeapMultiBlockPostingList(
postingsOutput,
floatVectorValues,
onHeapQuantizedVectors,
centroid,
cluster,
size,
docIdsWriter,
bulkWriter
);
} else {
postingsOutput.writeByte(SINGLE_BLOCK_POSTING_LIST);
writeOnHeapSingleBlockPostingList(
postingsOutput,
floatVectorValues,
onHeapQuantizedVectors,
centroid,
k -> cluster[k],
size,
docIdsWriter,
bulkWriter
);
}
}

if (logger.isDebugEnabled()) {
@@ -137,6 +160,69 @@ LongValues buildAndWritePostingsLists(
return offsets.build();
}

private void writeOnHeapMultiBlockPostingList(
IndexOutput postingsOutput,
FloatVectorValues floatVectorValues,
OnHeapQuantizedVectors onHeapQuantizedVectors,
float[] centroid,
int[] cluster,
int size,
DocIdsWriter docIdsWriter,
DiskBBQBulkWriter bulkWriter
) throws IOException {
int numBlocks = (int) Math.ceil((double) size / MAX_POSTING_LIST_BLOCK_SIZE);
postingsOutput.writeVInt(numBlocks);
for (int i = 0; i < numBlocks - 1; i++) {
int offset = MAX_POSTING_LIST_BLOCK_SIZE * i;
postingsOutput.writeVInt(MAX_POSTING_LIST_BLOCK_SIZE);
writeOnHeapSingleBlockPostingList(
postingsOutput,
floatVectorValues,
onHeapQuantizedVectors,
centroid,
k -> cluster[offset + k],
MAX_POSTING_LIST_BLOCK_SIZE,
docIdsWriter,
bulkWriter
);
}
int lastBlock = size - (numBlocks - 1) * MAX_POSTING_LIST_BLOCK_SIZE;
assert lastBlock >= 0;
if (lastBlock > 0) {
postingsOutput.writeVInt(lastBlock);
writeOnHeapSingleBlockPostingList(
postingsOutput,
floatVectorValues,
onHeapQuantizedVectors,
centroid,
k -> cluster[(numBlocks - 1) * MAX_POSTING_LIST_BLOCK_SIZE + k],
lastBlock,
docIdsWriter,
bulkWriter
);
}
}

private void writeOnHeapSingleBlockPostingList(
IndexOutput postingsOutput,
FloatVectorValues floatVectorValues,
OnHeapQuantizedVectors onHeapQuantizedVectors,
float[] centroid,
IntToIntFunction cluster,
int size,
DocIdsWriter docIdsWriter,
DiskBBQBulkWriter bulkWriter
) throws IOException {

onHeapQuantizedVectors.reset(centroid, size, cluster);
// TODO we might want to consider putting the docIds in a separate file
// to aid with only having to fetch vectors from slower storage when they are required
// keeping them in the same file indicates we pull the entire file into cache
docIdsWriter.writeDocIds(j -> floatVectorValues.ordToDoc(cluster.apply(j)), size, postingsOutput);
// write vectors
bulkWriter.writeVectors(onHeapQuantizedVectors);
}

@Override
LongValues buildAndWritePostingsLists(
FieldInfo fieldInfo,
@@ -237,7 +323,7 @@ LongValues buildAndWritePostingsLists(
DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter.OneBitDiskBBQBulkWriter(ES91OSQVectorsScorer.BULK_SIZE, postingsOutput);
final ByteBuffer buffer = ByteBuffer.allocate(fieldInfo.getVectorDimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
// write the max posting list size
postingsOutput.writeVInt(maxPostingListSize);
postingsOutput.writeVInt(Math.min(MAX_POSTING_LIST_BLOCK_SIZE, maxPostingListSize));
// write the posting lists
for (int c = 0; c < centroidSupplier.size(); c++) {
float[] centroid = centroidSupplier.centroid(c);
@@ -252,13 +338,31 @@ LongValues buildAndWritePostingsLists(
// write docIds
int size = cluster.length;
postingsOutput.writeVInt(size);
offHeapQuantizedVectors.reset(size, ord -> isOverspill[ord], ord -> cluster[ord]);
// TODO we might want to consider putting the docIds in a separate file
// to aid with only having to fetch vectors from slower storage when they are required
// keeping them in the same file indicates we pull the entire file into cache
docIdsWriter.writeDocIds(j -> floatVectorValues.ordToDoc(cluster[j]), size, postingsOutput);
// write vectors
bulkWriter.writeVectors(offHeapQuantizedVectors);
if (size > MAX_POSTING_LIST_BLOCK_SIZE) {
postingsOutput.writeByte(MULTI_BLOCK_POSTING_LIST);
writeOffHeapMultiBlockPostingList(
postingsOutput,
floatVectorValues,
offHeapQuantizedVectors,
cluster,
size,
isOverspill,
docIdsWriter,
bulkWriter
);
} else {
postingsOutput.writeByte(SINGLE_BLOCK_POSTING_LIST);
writeOffHeapBlockPostingList(
postingsOutput,
floatVectorValues,
offHeapQuantizedVectors,
k -> cluster[k],
size,
b -> isOverspill[b],
docIdsWriter,
bulkWriter
);
}
}

if (logger.isDebugEnabled()) {
@@ -268,6 +372,69 @@ LongValues buildAndWritePostingsLists(
}
}

private void writeOffHeapMultiBlockPostingList(
IndexOutput postingsOutput,
FloatVectorValues floatVectorValues,
OffHeapQuantizedVectors offHeapQuantizedVectors,
int[] cluster,
int size,
boolean[] isOverspill,
DocIdsWriter docIdsWriter,
DiskBBQBulkWriter bulkWriter
) throws IOException {
int numBlocks = (int) Math.ceil((double) size / MAX_POSTING_LIST_BLOCK_SIZE);
postingsOutput.writeVInt(numBlocks);
for (int i = 0; i < numBlocks - 1; i++) {
int offset = MAX_POSTING_LIST_BLOCK_SIZE * i;
postingsOutput.writeVInt(MAX_POSTING_LIST_BLOCK_SIZE);
writeOffHeapBlockPostingList(
postingsOutput,
floatVectorValues,
offHeapQuantizedVectors,
k -> cluster[offset + k],
MAX_POSTING_LIST_BLOCK_SIZE,
b -> isOverspill[offset + b],
docIdsWriter,
bulkWriter
);
}
int lastBlock = size - (numBlocks - 1) * MAX_POSTING_LIST_BLOCK_SIZE;
assert lastBlock >= 0;
if (lastBlock > 0) {
postingsOutput.writeVInt(lastBlock);
writeOffHeapBlockPostingList(
postingsOutput,
floatVectorValues,
offHeapQuantizedVectors,
k -> cluster[(numBlocks - 1) * MAX_POSTING_LIST_BLOCK_SIZE + k],
lastBlock,
b -> isOverspill[(numBlocks - 1) * MAX_POSTING_LIST_BLOCK_SIZE + b],
docIdsWriter,
bulkWriter
);
}
}

private void writeOffHeapBlockPostingList(
IndexOutput postingsOutput,
FloatVectorValues floatVectorValues,
OffHeapQuantizedVectors offHeapQuantizedVectors,
IntToIntFunction cluster,
int size,
IntPredicate isOverspill,
DocIdsWriter docIdsWriter,
DiskBBQBulkWriter bulkWriter
) throws IOException {

offHeapQuantizedVectors.reset(size, isOverspill::test, cluster);
// TODO we might want to consider putting the docIds in a separate file
// to aid with only having to fetch vectors from slower storage when they are required
// keeping them in the same file indicates we pull the entire file into cache
docIdsWriter.writeDocIds(j -> floatVectorValues.ordToDoc(cluster.apply(j)), size, postingsOutput);
// write vectors
bulkWriter.writeVectors(offHeapQuantizedVectors);
}

private static void printClusterQualityStatistics(int[][] clusters) {
float min = Float.MAX_VALUE;
float max = Float.MIN_VALUE;