From 4d2841e37eb5224b3b470b71a07ef92fdbd37313 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Mon, 11 Aug 2025 14:45:06 -0400 Subject: [PATCH 1/6] Sort postings lists by doc ID --- .../vectors/DefaultIVFVectorsReader.java | 9 +- .../vectors/DefaultIVFVectorsWriter.java | 84 +++++++++++++++++-- 2 files changed, 86 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java index 312172f251dda..2aec61ddd8aa5 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java @@ -16,6 +16,7 @@ import org.apache.lucene.search.KnnCollector; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.GroupVIntUtil; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.hnsw.NeighborQueue; import org.elasticsearch.index.codec.vectors.reflect.OffHeapStats; @@ -370,7 +371,13 @@ public int resetPostingsScorer(long offset) throws IOException { vectors = indexInput.readVInt(); // read the doc ids assert vectors <= docIdsScratch.length; - docIdsWriter.readInts(indexInput, vectors, docIdsScratch); + GroupVIntUtil.readGroupVInts(indexInput, docIdsScratch, vectors); + // reconstitute from the deltas + int sum = 0; + for (int i = 1; i < vectors; i++) { + sum += docIdsScratch[i]; + docIdsScratch[i] = sum; + } slicePos = indexInput.getFilePointer(); return vectors; } diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java index d16163d6934e8..6e5f09538d785 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java @@ -17,6 +17,7 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.IntroSorter; import org.apache.lucene.util.LongValues; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.hnsw.IntToIntFunction; @@ -101,7 +102,6 @@ LongValues buildAndWritePostingsLists( postingsOutput.writeVInt(maxPostingListSize); // write the posting lists final PackedLongValues.Builder offsets = PackedLongValues.monotonicBuilder(PackedInts.COMPACT); - DocIdsWriter docIdsWriter = new DocIdsWriter(); DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter.OneBitDiskBBQBulkWriter(ES91OSQVectorsScorer.BULK_SIZE, postingsOutput); OnHeapQuantizedVectors onHeapQuantizedVectors = new OnHeapQuantizedVectors( floatVectorValues, @@ -109,6 +109,9 @@ LongValues buildAndWritePostingsLists( new OptimizedScalarQuantizer(fieldInfo.getVectorSimilarityFunction()) ); final ByteBuffer buffer = ByteBuffer.allocate(fieldInfo.getVectorDimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); + int[] docIds = null; + int[] docDeltas = null; + int[] clusterOrds = null; for (int c = 0; c < centroidSupplier.size(); c++) { float[] centroid = centroidSupplier.centroid(c); int[] cluster = assignmentsByCluster[c]; @@ -121,11 +124,28 @@ LongValues buildAndWritePostingsLists( int size = cluster.length; // write docIds postingsOutput.writeVInt(size); - 
onHeapQuantizedVectors.reset(centroid, size, ord -> cluster[ord]); + if (docIds == null || docIds.length < cluster.length) { + docIds = new int[cluster.length]; + clusterOrds = new int[cluster.length]; + docDeltas = new int[cluster.length]; + } + for (int j = 0; j < size; j++) { + docIds[j] = floatVectorValues.ordToDoc(cluster[j]); + clusterOrds[j] = j; + } + final int[] finalDocs = docIds; + final int[] finalOrds = clusterOrds; + // sort cluster.buffer by docIds values, this way cluster ordinals are sorted by docIds + new IntSorter(clusterOrds, i -> finalDocs[i]).sort(0, size); + // encode doc deltas + for (int j = 0; j < size; j++) { + docDeltas[j] = j == 0 ? finalDocs[finalOrds[j]] : finalDocs[finalOrds[j]] - finalDocs[finalOrds[j - 1]]; + } + onHeapQuantizedVectors.reset(centroid, size, ord -> cluster[finalOrds[ord]]); // TODO we might want to consider putting the docIds in a separate file // to aid with only having to fetch vectors from slower storage when they are required // keeping them in the same file indicates we pull the entire file into cache - docIdsWriter.writeDocIds(j -> floatVectorValues.ordToDoc(cluster[j]), size, postingsOutput); + postingsOutput.writeGroupVInts(docDeltas, size); // write vectors bulkWriter.writeVectors(onHeapQuantizedVectors); } @@ -233,12 +253,14 @@ LongValues buildAndWritePostingsLists( quantizedVectorsInput, fieldInfo.getVectorDimension() ); - DocIdsWriter docIdsWriter = new DocIdsWriter(); DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter.OneBitDiskBBQBulkWriter(ES91OSQVectorsScorer.BULK_SIZE, postingsOutput); final ByteBuffer buffer = ByteBuffer.allocate(fieldInfo.getVectorDimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); // write the max posting list size postingsOutput.writeVInt(maxPostingListSize); // write the posting lists + int[] docIds = null; + int[] docDeltas = null; + int[] clusterOrds = null; for (int c = 0; c < centroidSupplier.size(); c++) { float[] centroid = centroidSupplier.centroid(c); int[] cluster = assignmentsByCluster[c]; @@ -252,11 +274,28 @@ LongValues buildAndWritePostingsLists( // write docIds int size = cluster.length; postingsOutput.writeVInt(size); - offHeapQuantizedVectors.reset(size, ord -> isOverspill[ord], ord -> cluster[ord]); + if (docIds == null || docIds.length < cluster.length) { + docIds = new int[cluster.length]; + clusterOrds = new int[cluster.length]; + docDeltas = new int[cluster.length]; + } + for (int j = 0; j < size; j++) { + docIds[j] = floatVectorValues.ordToDoc(cluster[j]); + clusterOrds[j] = j; + } + final int[] finalDocs = docIds; + final int[] finalOrds = clusterOrds; + // sort cluster.buffer by docIds values, this way cluster ordinals are sorted by docIds + new IntSorter(clusterOrds, i -> finalDocs[i]).sort(0, size); + // encode doc deltas + for (int j = 0; j < size; j++) { + docDeltas[j] = j == 0 ? 
finalDocs[finalOrds[j]] : finalDocs[finalOrds[j]] - finalDocs[finalOrds[j - 1]]; + } + offHeapQuantizedVectors.reset(size, ord -> isOverspill[finalOrds[ord]], ord -> cluster[finalOrds[ord]]); // TODO we might want to consider putting the docIds in a separate file // to aid with only having to fetch vectors from slower storage when they are required // keeping them in the same file indicates we pull the entire file into cache - docIdsWriter.writeDocIds(j -> floatVectorValues.ordToDoc(cluster[j]), size, postingsOutput); + postingsOutput.writeGroupVInts(docDeltas, size); // write vectors bulkWriter.writeVectors(offHeapQuantizedVectors); } @@ -717,4 +756,37 @@ public void readQuantizedVector(int ord, boolean isOverspill) throws IOException bitSum = quantizedVectorsInput.readShort(); } } + + private static class IntSorter extends IntroSorter { + int pivot = -1; + private final int[] arr; + private final IntToIntFunction func; + + private IntSorter(int[] arr, IntToIntFunction func) { + this.arr = arr; + this.func = func; + } + + @Override + protected void setPivot(int i) { + pivot = func.apply(arr[i]); + } + + @Override + protected int comparePivot(int j) { + return Integer.compare(pivot, func.apply(arr[j])); + } + + @Override + protected int compare(int a, int b) { + return Integer.compare(func.apply(arr[a]), func.apply(arr[b])); + } + + @Override + protected void swap(int i, int j) { + final int tmp = arr[i]; + arr[i] = arr[j]; + arr[j] = tmp; + } + } } From 8d8ea3611320e930b664fbb028e9ffb1f3e31964 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Mon, 11 Aug 2025 16:22:42 -0400 Subject: [PATCH 2/6] iter --- .../index/codec/vectors/DefaultIVFVectorsReader.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java index 8759eb3d5420d..6460ef4b1766f 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java @@ -327,7 +327,6 @@ private static class MemorySegmentPostingsVisitor implements PostingVisitor { final float[] centroid; long slicePos; OptimizedScalarQuantizer.QuantizationResult queryCorrections; - DocIdsWriter docIdsWriter = new DocIdsWriter(); final float[] scratch; final int[] quantizationScratch; @@ -373,7 +372,7 @@ public int resetPostingsScorer(long offset) throws IOException { GroupVIntUtil.readGroupVInts(indexInput, docIdsScratch, vectors); // reconstitute from the deltas int sum = 0; - for (int i = 1; i < vectors; i++) { + for (int i = 0; i < vectors; i++) { sum += docIdsScratch[i]; docIdsScratch[i] = sum; } From ba91845a433f226c1def39f1ace919b8c4173e41 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Mon, 11 Aug 2025 17:39:24 -0400 Subject: [PATCH 3/6] delta encode --- .../index/codec/vectors/DefaultIVFVectorsReader.java | 4 ++-- .../index/codec/vectors/DefaultIVFVectorsWriter.java | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java index 6460ef4b1766f..d80fc216e556c 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java 
+++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java @@ -16,7 +16,6 @@ import org.apache.lucene.search.KnnCollector; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.GroupVIntUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.hnsw.NeighborQueue; @@ -332,6 +331,7 @@ private static class MemorySegmentPostingsVisitor implements PostingVisitor { final int[] quantizationScratch; final byte[] quantizedQueryScratch; final OptimizedScalarQuantizer quantizer; + final DocIdsWriter idsWriter = new DocIdsWriter(); final float[] correctiveValues = new float[3]; final long quantizedVectorByteSize; @@ -369,7 +369,7 @@ public int resetPostingsScorer(long offset) throws IOException { vectors = indexInput.readVInt(); // read the doc ids assert vectors <= docIdsScratch.length; - GroupVIntUtil.readGroupVInts(indexInput, docIdsScratch, vectors); + idsWriter.readInts(indexInput, vectors, docIdsScratch); // reconstitute from the deltas int sum = 0; for (int i = 0; i < vectors; i++) { diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java index 6e5f09538d785..444ff9cf18a0a 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java @@ -112,6 +112,7 @@ LongValues buildAndWritePostingsLists( int[] docIds = null; int[] docDeltas = null; int[] clusterOrds = null; + DocIdsWriter idsWriter = new DocIdsWriter(); for (int c = 0; c < centroidSupplier.size(); c++) { float[] centroid = centroidSupplier.centroid(c); int[] cluster = assignmentsByCluster[c]; @@ -141,11 +142,12 @@ LongValues buildAndWritePostingsLists( for (int j = 0; j < size; j++) { docDeltas[j] = j == 0 ? finalDocs[finalOrds[j]] : finalDocs[finalOrds[j]] - finalDocs[finalOrds[j - 1]]; } + final int[] finalDocDeltas = docDeltas; onHeapQuantizedVectors.reset(centroid, size, ord -> cluster[finalOrds[ord]]); // TODO we might want to consider putting the docIds in a separate file // to aid with only having to fetch vectors from slower storage when they are required // keeping them in the same file indicates we pull the entire file into cache - postingsOutput.writeGroupVInts(docDeltas, size); + idsWriter.writeDocIds(i -> finalDocDeltas[i], size, postingsOutput); // write vectors bulkWriter.writeVectors(onHeapQuantizedVectors); } @@ -261,6 +263,7 @@ LongValues buildAndWritePostingsLists( int[] docIds = null; int[] docDeltas = null; int[] clusterOrds = null; + DocIdsWriter idsWriter = new DocIdsWriter(); for (int c = 0; c < centroidSupplier.size(); c++) { float[] centroid = centroidSupplier.centroid(c); int[] cluster = assignmentsByCluster[c]; @@ -291,11 +294,12 @@ LongValues buildAndWritePostingsLists( for (int j = 0; j < size; j++) { docDeltas[j] = j == 0 ? 
finalDocs[finalOrds[j]] : finalDocs[finalOrds[j]] - finalDocs[finalOrds[j - 1]]; } + final int[] finalDocDeltas = docDeltas; offHeapQuantizedVectors.reset(size, ord -> isOverspill[finalOrds[ord]], ord -> cluster[finalOrds[ord]]); // TODO we might want to consider putting the docIds in a separate file // to aid with only having to fetch vectors from slower storage when they are required // keeping them in the same file indicates we pull the entire file into cache - postingsOutput.writeGroupVInts(docDeltas, size); + idsWriter.writeDocIds(i -> finalDocDeltas[i], size, postingsOutput); // write vectors bulkWriter.writeVectors(offHeapQuantizedVectors); } From 757c14949cf9399ad85b598d29555dc925cab539 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Tue, 12 Aug 2025 09:57:06 -0400 Subject: [PATCH 4/6] Update DocIdsWriter.java --- .../index/codec/vectors/DocIdsWriter.java | 81 ++----------------- 1 file changed, 5 insertions(+), 76 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/DocIdsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/DocIdsWriter.java index d46a6301b60a2..05ae6467b4033 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/DocIdsWriter.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/DocIdsWriter.java @@ -23,8 +23,6 @@ import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.DocBaseBitSetIterator; -import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LongsRef; import org.apache.lucene.util.hnsw.IntToIntFunction; @@ -42,7 +40,6 @@ final class DocIdsWriter { public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 512; private static final byte CONTINUOUS_IDS = (byte) -2; - private static final byte BITSET_IDS = (byte) -1; private static final byte DELTA_BPV_16 = (byte) 16; private static final byte BPV_21 = (byte) 21; private static final byte BPV_24 = (byte) 24; @@ -92,21 +89,11 @@ void writeDocIds(IntToIntFunction docIds, int count, DataOutput out) throws IOEx } int min2max = max - min + 1; - if (strictlySorted) { - if (min2max == count) { - // continuous ids, typically happens when segment is sorted - out.writeByte(CONTINUOUS_IDS); - out.writeVInt(docIds.apply(0)); - return; - } else if (min2max <= (count << 4)) { - assert min2max > count : "min2max: " + min2max + ", count: " + count; - // Only trigger bitset optimization when max - min + 1 <= 16 * count in order to avoid - // expanding too much storage. - // A field with lower cardinality will have higher probability to trigger this optimization. 
- out.writeByte(BITSET_IDS); - writeIdsAsBitSet(docIds, count, out); - return; - } + if (strictlySorted && min2max == count) { + // continuous ids, typically happens when segment is sorted + out.writeByte(CONTINUOUS_IDS); + out.writeVInt(docIds.apply(0)); + return; } if (min2max <= 0xFFFF) { @@ -180,38 +167,6 @@ void writeDocIds(IntToIntFunction docIds, int count, DataOutput out) throws IOEx } } - private static void writeIdsAsBitSet(IntToIntFunction docIds, int count, DataOutput out) throws IOException { - int min = docIds.apply(0); - int max = docIds.apply(count - 1); - - final int offsetWords = min >> 6; - final int offsetBits = offsetWords << 6; - final int totalWordCount = FixedBitSet.bits2words(max - offsetBits + 1); - long currentWord = 0; - int currentWordIndex = 0; - - out.writeVInt(offsetWords); - out.writeVInt(totalWordCount); - // build bit set streaming - for (int i = 0; i < count; i++) { - final int index = docIds.apply(i) - offsetBits; - final int nextWordIndex = index >> 6; - assert currentWordIndex <= nextWordIndex; - if (currentWordIndex < nextWordIndex) { - out.writeLong(currentWord); - currentWord = 0L; - currentWordIndex++; - while (currentWordIndex < nextWordIndex) { - currentWordIndex++; - out.writeLong(0L); - } - } - currentWord |= 1L << index; - } - out.writeLong(currentWord); - assert currentWordIndex + 1 == totalWordCount; - } - /** Read {@code count} integers into {@code docIDs}. */ void readInts(IndexInput in, int count, int[] docIDs) throws IOException { if (count == 0) { @@ -225,9 +180,6 @@ void readInts(IndexInput in, int count, int[] docIDs) throws IOException { case CONTINUOUS_IDS: readContinuousIds(in, count, docIDs); break; - case BITSET_IDS: - readBitSet(in, count, docIDs); - break; case DELTA_BPV_16: readDelta16(in, count, docIDs); break; @@ -245,20 +197,6 @@ void readInts(IndexInput in, int count, int[] docIDs) throws IOException { } } - private DocIdSetIterator readBitSetIterator(IndexInput in, int count) throws IOException { - int offsetWords = in.readVInt(); - int longLen = in.readVInt(); - scratchLongs.longs = ArrayUtil.growNoCopy(scratchLongs.longs, longLen); - in.readLongs(scratchLongs.longs, 0, longLen); - // make ghost bits clear for FixedBitSet. 
- if (longLen < scratchLongs.length) { - Arrays.fill(scratchLongs.longs, longLen, scratchLongs.longs.length, 0); - } - scratchLongs.length = longLen; - FixedBitSet bitSet = new FixedBitSet(scratchLongs.longs, longLen << 6); - return new DocBaseBitSetIterator(bitSet, count, offsetWords << 6); - } - private static void readContinuousIds(IndexInput in, int count, int[] docIDs) throws IOException { int start = in.readVInt(); for (int i = 0; i < count; i++) { @@ -266,15 +204,6 @@ private static void readContinuousIds(IndexInput in, int count, int[] docIDs) th } } - private void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException { - DocIdSetIterator iterator = readBitSetIterator(in, count); - int docId, pos = 0; - while ((docId = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - docIDs[pos++] = docId; - } - assert pos == count : "pos: " + pos + ", count: " + count; - } - private static void readDelta16(IndexInput in, int count, int[] docIds) throws IOException { final int min = in.readVInt(); final int half = count >> 1; From 78643cd2de8e834fff64ec76d6de7e2ba652d19c Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Tue, 12 Aug 2025 14:06:46 +0000 Subject: [PATCH 5/6] [CI] Auto commit changes from spotless --- .../org/elasticsearch/index/codec/vectors/DocIdsWriter.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/DocIdsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/DocIdsWriter.java index 05ae6467b4033..257a1340eeff1 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/DocIdsWriter.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/DocIdsWriter.java @@ -19,16 +19,13 @@ package org.elasticsearch.index.codec.vectors; import org.apache.lucene.index.PointValues.IntersectVisitor; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LongsRef; import org.apache.lucene.util.hnsw.IntToIntFunction; import java.io.IOException; -import java.util.Arrays; /** * This class is used to write and read the doc ids in a compressed format. 
The format is optimized From 7fa53d10c78aa372c0347f1aa8ce7fa4dff5bfd3 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Tue, 12 Aug 2025 10:37:30 -0400 Subject: [PATCH 6/6] Update DefaultIVFVectorsWriter.java --- .../vectors/DefaultIVFVectorsWriter.java | 44 ++++++------------- 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java index 444ff9cf18a0a..5e696b74530a8 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java @@ -109,9 +109,9 @@ LongValues buildAndWritePostingsLists( new OptimizedScalarQuantizer(fieldInfo.getVectorSimilarityFunction()) ); final ByteBuffer buffer = ByteBuffer.allocate(fieldInfo.getVectorDimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); - int[] docIds = null; - int[] docDeltas = null; - int[] clusterOrds = null; + final int[] docIds = new int[maxPostingListSize]; + final int[] docDeltas = new int[maxPostingListSize]; + final int[] clusterOrds = new int[maxPostingListSize]; DocIdsWriter idsWriter = new DocIdsWriter(); for (int c = 0; c < centroidSupplier.size(); c++) { float[] centroid = centroidSupplier.centroid(c); @@ -125,29 +125,21 @@ LongValues buildAndWritePostingsLists( int size = cluster.length; // write docIds postingsOutput.writeVInt(size); - if (docIds == null || docIds.length < cluster.length) { - docIds = new int[cluster.length]; - clusterOrds = new int[cluster.length]; - docDeltas = new int[cluster.length]; - } for (int j = 0; j < size; j++) { docIds[j] = floatVectorValues.ordToDoc(cluster[j]); clusterOrds[j] = j; } - final int[] finalDocs = docIds; - final int[] finalOrds = clusterOrds; // sort cluster.buffer by docIds values, this way cluster ordinals are sorted by docIds - new IntSorter(clusterOrds, i -> finalDocs[i]).sort(0, size); + new IntSorter(clusterOrds, i -> docIds[i]).sort(0, size); // encode doc deltas for (int j = 0; j < size; j++) { - docDeltas[j] = j == 0 ? finalDocs[finalOrds[j]] : finalDocs[finalOrds[j]] - finalDocs[finalOrds[j - 1]]; + docDeltas[j] = j == 0 ? 
docIds[clusterOrds[j]] : docIds[clusterOrds[j]] - docIds[clusterOrds[j - 1]]; } - final int[] finalDocDeltas = docDeltas; - onHeapQuantizedVectors.reset(centroid, size, ord -> cluster[finalOrds[ord]]); + onHeapQuantizedVectors.reset(centroid, size, ord -> cluster[clusterOrds[ord]]); // TODO we might want to consider putting the docIds in a separate file // to aid with only having to fetch vectors from slower storage when they are required // keeping them in the same file indicates we pull the entire file into cache - idsWriter.writeDocIds(i -> finalDocDeltas[i], size, postingsOutput); + idsWriter.writeDocIds(i -> docDeltas[i], size, postingsOutput); // write vectors bulkWriter.writeVectors(onHeapQuantizedVectors); } @@ -260,9 +252,9 @@ LongValues buildAndWritePostingsLists( // write the max posting list size postingsOutput.writeVInt(maxPostingListSize); // write the posting lists - int[] docIds = null; - int[] docDeltas = null; - int[] clusterOrds = null; + final int[] docIds = new int[maxPostingListSize]; + final int[] docDeltas = new int[maxPostingListSize]; + final int[] clusterOrds = new int[maxPostingListSize]; DocIdsWriter idsWriter = new DocIdsWriter(); for (int c = 0; c < centroidSupplier.size(); c++) { float[] centroid = centroidSupplier.centroid(c); @@ -277,29 +269,21 @@ LongValues buildAndWritePostingsLists( // write docIds int size = cluster.length; postingsOutput.writeVInt(size); - if (docIds == null || docIds.length < cluster.length) { - docIds = new int[cluster.length]; - clusterOrds = new int[cluster.length]; - docDeltas = new int[cluster.length]; - } for (int j = 0; j < size; j++) { docIds[j] = floatVectorValues.ordToDoc(cluster[j]); clusterOrds[j] = j; } - final int[] finalDocs = docIds; - final int[] finalOrds = clusterOrds; // sort cluster.buffer by docIds values, this way cluster ordinals are sorted by docIds - new IntSorter(clusterOrds, i -> finalDocs[i]).sort(0, size); + new IntSorter(clusterOrds, i -> docIds[i]).sort(0, size); // encode doc deltas for (int j = 0; j < size; j++) { - docDeltas[j] = j == 0 ? finalDocs[finalOrds[j]] : finalDocs[finalOrds[j]] - finalDocs[finalOrds[j - 1]]; + docDeltas[j] = j == 0 ? docIds[clusterOrds[j]] : docIds[clusterOrds[j]] - docIds[clusterOrds[j - 1]]; } - final int[] finalDocDeltas = docDeltas; - offHeapQuantizedVectors.reset(size, ord -> isOverspill[finalOrds[ord]], ord -> cluster[finalOrds[ord]]); + offHeapQuantizedVectors.reset(size, ord -> isOverspill[clusterOrds[ord]], ord -> cluster[clusterOrds[ord]]); // TODO we might want to consider putting the docIds in a separate file // to aid with only having to fetch vectors from slower storage when they are required // keeping them in the same file indicates we pull the entire file into cache - idsWriter.writeDocIds(i -> finalDocDeltas[i], size, postingsOutput); + idsWriter.writeDocIds(i -> docDeltas[i], size, postingsOutput); // write vectors bulkWriter.writeVectors(offHeapQuantizedVectors); }
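
For reference, a minimal self-contained sketch of the sort-then-delta-encode scheme these patches converge on: ordinals in a cluster are indirectly sorted by their doc IDs, the sorted doc IDs are written as deltas (first value absolute), and the reader reconstitutes them with a prefix sum. It assumes a plain int[] ordToDoc mapping in place of FloatVectorValues, uses java.util sorting instead of Lucene's IntroSorter, and skips the DocIdsWriter bit-packing; class and method names are illustrative, not the Lucene/Elasticsearch API.

import java.util.Arrays;
import java.util.Comparator;

final class SortedPostingsSketch {

    /** Returns cluster positions reordered so their doc IDs are ascending (indirect sort). */
    static Integer[] sortByDocId(int[] cluster, int[] ordToDoc) {
        Integer[] positions = new Integer[cluster.length];
        for (int i = 0; i < positions.length; i++) {
            positions[i] = i;
        }
        // reorder positions only; cluster itself is untouched, mirroring IntSorter over clusterOrds
        Arrays.sort(positions, Comparator.comparingInt(p -> ordToDoc[cluster[p]]));
        return positions;
    }

    /** Writer side: encode the sorted doc IDs as deltas; j == 0 stores the absolute doc ID. */
    static int[] encodeDeltas(int[] cluster, int[] ordToDoc, Integer[] sortedPositions) {
        int[] deltas = new int[sortedPositions.length];
        int previous = 0;
        for (int j = 0; j < sortedPositions.length; j++) {
            int doc = ordToDoc[cluster[sortedPositions[j]]];
            deltas[j] = doc - previous;
            previous = doc;
        }
        return deltas;
    }

    /** Reader side: prefix-sum the deltas back into absolute doc IDs, as in resetPostingsScorer. */
    static int[] decodeDeltas(int[] deltas) {
        int[] docs = new int[deltas.length];
        int sum = 0;
        for (int i = 0; i < deltas.length; i++) {
            sum += deltas[i];
            docs[i] = sum;
        }
        return docs;
    }

    public static void main(String[] args) {
        int[] ordToDoc = { 42, 7, 19, 3, 25 };   // hypothetical ord -> docId mapping
        int[] cluster = { 0, 2, 3, 4 };          // vector ordinals assigned to one centroid
        Integer[] order = sortByDocId(cluster, ordToDoc);
        int[] deltas = encodeDeltas(cluster, ordToDoc, order);
        System.out.println(Arrays.toString(deltas));               // [3, 16, 6, 17]
        System.out.println(Arrays.toString(decodeDeltas(deltas))); // [3, 19, 25, 42]
    }
}

Note that deltas[0] carries the absolute doc ID, which is why the reader's prefix sum starts at i = 0 after the fix in PATCH 2/6.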