Commit 4d2841e

Sort postings lists by doc ID
1 parent de4245c commit 4d2841e

2 files changed (+86, -7)

server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java

Lines changed: 8 additions & 1 deletion
@@ -16,6 +16,7 @@
 import org.apache.lucene.search.KnnCollector;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.GroupVIntUtil;
 import org.apache.lucene.util.VectorUtil;
 import org.apache.lucene.util.hnsw.NeighborQueue;
 import org.elasticsearch.index.codec.vectors.reflect.OffHeapStats;

@@ -370,7 +371,13 @@ public int resetPostingsScorer(long offset) throws IOException {
         vectors = indexInput.readVInt();
         // read the doc ids
         assert vectors <= docIdsScratch.length;
-        docIdsWriter.readInts(indexInput, vectors, docIdsScratch);
+        GroupVIntUtil.readGroupVInts(indexInput, docIdsScratch, vectors);
+        // reconstitute the doc IDs from the deltas with a running prefix sum
+        int sum = docIdsScratch[0];
+        for (int i = 1; i < vectors; i++) {
+            sum += docIdsScratch[i];
+            docIdsScratch[i] = sum;
+        }
         slicePos = indexInput.getFilePointer();
         return vectors;
     }
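
Note on the decoding loop above: the posting list now stores the first doc ID as an absolute value and every later entry as the gap to its predecessor, and the prefix sum rebuilds the absolute IDs in place. A minimal standalone sketch of the round trip (plain Java, no Lucene types; the method names are illustrative, not part of the codebase):

    // Delta-encode an ascending doc ID list: first value absolute, rest as gaps.
    static int[] toDeltas(int[] docIds, int count) {
        int[] deltas = new int[count];
        for (int i = 0; i < count; i++) {
            deltas[i] = i == 0 ? docIds[0] : docIds[i] - docIds[i - 1];
        }
        return deltas;
    }

    // Decode in place, mirroring the reader loop: a prefix sum seeded with the
    // absolute first entry restores the original doc IDs.
    static void fromDeltas(int[] deltas, int count) {
        int sum = deltas[0];
        for (int i = 1; i < count; i++) {
            sum += deltas[i];
            deltas[i] = sum;
        }
    }

Sorting by doc ID is what makes the encoding pay off: the gaps become small non-negative integers, which the group-varint format packs into far fewer bytes than raw doc IDs.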

server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java

Lines changed: 78 additions & 6 deletions
@@ -17,6 +17,7 @@
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.IntroSorter;
 import org.apache.lucene.util.LongValues;
 import org.apache.lucene.util.VectorUtil;
 import org.apache.lucene.util.hnsw.IntToIntFunction;
@@ -101,14 +102,16 @@ LongValues buildAndWritePostingsLists(
         postingsOutput.writeVInt(maxPostingListSize);
         // write the posting lists
         final PackedLongValues.Builder offsets = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
-        DocIdsWriter docIdsWriter = new DocIdsWriter();
         DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter.OneBitDiskBBQBulkWriter(ES91OSQVectorsScorer.BULK_SIZE, postingsOutput);
         OnHeapQuantizedVectors onHeapQuantizedVectors = new OnHeapQuantizedVectors(
             floatVectorValues,
             fieldInfo.getVectorDimension(),
             new OptimizedScalarQuantizer(fieldInfo.getVectorSimilarityFunction())
         );
         final ByteBuffer buffer = ByteBuffer.allocate(fieldInfo.getVectorDimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
+        int[] docIds = null;
+        int[] docDeltas = null;
+        int[] clusterOrds = null;
         for (int c = 0; c < centroidSupplier.size(); c++) {
             float[] centroid = centroidSupplier.centroid(c);
             int[] cluster = assignmentsByCluster[c];
@@ -121,11 +124,28 @@ LongValues buildAndWritePostingsLists(
             int size = cluster.length;
             // write docIds
             postingsOutput.writeVInt(size);
-            onHeapQuantizedVectors.reset(centroid, size, ord -> cluster[ord]);
+            if (docIds == null || docIds.length < cluster.length) {
+                docIds = new int[cluster.length];
+                clusterOrds = new int[cluster.length];
+                docDeltas = new int[cluster.length];
+            }
+            for (int j = 0; j < size; j++) {
+                docIds[j] = floatVectorValues.ordToDoc(cluster[j]);
+                clusterOrds[j] = j;
+            }
+            final int[] finalDocs = docIds;
+            final int[] finalOrds = clusterOrds;
+            // sort the cluster ordinals by doc ID so the posting list is written in doc ID order
+            new IntSorter(clusterOrds, i -> finalDocs[i]).sort(0, size);
+            // encode the doc IDs as deltas between consecutive sorted values
+            for (int j = 0; j < size; j++) {
+                docDeltas[j] = j == 0 ? finalDocs[finalOrds[j]] : finalDocs[finalOrds[j]] - finalDocs[finalOrds[j - 1]];
+            }
+            onHeapQuantizedVectors.reset(centroid, size, ord -> cluster[finalOrds[ord]]);
             // TODO we might want to consider putting the docIds in a separate file
             // to aid with only having to fetch vectors from slower storage when they are required
             // keeping them in the same file means we pull the entire file into cache
-            docIdsWriter.writeDocIds(j -> floatVectorValues.ordToDoc(cluster[j]), size, postingsOutput);
+            postingsOutput.writeGroupVInts(docDeltas, size);
             // write vectors
             bulkWriter.writeVectors(onHeapQuantizedVectors);
         }
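
The writer never moves the vectors themselves: it permutes an array of cluster ordinals so that position j points at the vector with the j-th smallest doc ID, then derives the gaps from that permutation. A hedged sketch of the same two steps in plain Java (using java.util sorting rather than the IntroSorter added below; docForOrd is a stand-in for floatVectorValues.ordToDoc(cluster[ord])):

    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.function.IntUnaryOperator;
    import java.util.stream.IntStream;

    // Fills ordsOut with the permutation that enumerates the cluster in
    // ascending doc ID order and returns the matching delta-encoded doc IDs.
    static int[] sortedDocDeltas(IntUnaryOperator docForOrd, int size, int[] ordsOut) {
        // indirect sort: order the ordinals 0..size-1 by their doc ID
        Integer[] ords = IntStream.range(0, size).boxed().toArray(Integer[]::new);
        Arrays.sort(ords, Comparator.comparingInt(docForOrd::applyAsInt));
        int[] deltas = new int[size];
        int prev = 0;
        for (int j = 0; j < size; j++) {
            ordsOut[j] = ords[j];
            int doc = docForOrd.applyAsInt(ords[j]);
            deltas[j] = j == 0 ? doc : doc - prev;
            prev = doc;
        }
        return deltas;
    }

The production code instead sorts a primitive int[] with the IntSorter defined at the bottom of this file, avoiding the boxing this sketch incurs on the hot indexing path.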
@@ -233,12 +253,14 @@ LongValues buildAndWritePostingsLists(
             quantizedVectorsInput,
             fieldInfo.getVectorDimension()
         );
-        DocIdsWriter docIdsWriter = new DocIdsWriter();
         DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter.OneBitDiskBBQBulkWriter(ES91OSQVectorsScorer.BULK_SIZE, postingsOutput);
         final ByteBuffer buffer = ByteBuffer.allocate(fieldInfo.getVectorDimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
         // write the max posting list size
         postingsOutput.writeVInt(maxPostingListSize);
         // write the posting lists
+        int[] docIds = null;
+        int[] docDeltas = null;
+        int[] clusterOrds = null;
         for (int c = 0; c < centroidSupplier.size(); c++) {
             float[] centroid = centroidSupplier.centroid(c);
             int[] cluster = assignmentsByCluster[c];
@@ -252,11 +274,28 @@ LongValues buildAndWritePostingsLists(
             // write docIds
             int size = cluster.length;
             postingsOutput.writeVInt(size);
-            offHeapQuantizedVectors.reset(size, ord -> isOverspill[ord], ord -> cluster[ord]);
+            if (docIds == null || docIds.length < cluster.length) {
+                docIds = new int[cluster.length];
+                clusterOrds = new int[cluster.length];
+                docDeltas = new int[cluster.length];
+            }
+            for (int j = 0; j < size; j++) {
+                docIds[j] = floatVectorValues.ordToDoc(cluster[j]);
+                clusterOrds[j] = j;
+            }
+            final int[] finalDocs = docIds;
+            final int[] finalOrds = clusterOrds;
+            // sort the cluster ordinals by doc ID so the posting list is written in doc ID order
+            new IntSorter(clusterOrds, i -> finalDocs[i]).sort(0, size);
+            // encode the doc IDs as deltas between consecutive sorted values
+            for (int j = 0; j < size; j++) {
+                docDeltas[j] = j == 0 ? finalDocs[finalOrds[j]] : finalDocs[finalOrds[j]] - finalDocs[finalOrds[j - 1]];
+            }
+            offHeapQuantizedVectors.reset(size, ord -> isOverspill[finalOrds[ord]], ord -> cluster[finalOrds[ord]]);
             // TODO we might want to consider putting the docIds in a separate file
             // to aid with only having to fetch vectors from slower storage when they are required
             // keeping them in the same file means we pull the entire file into cache
-            docIdsWriter.writeDocIds(j -> floatVectorValues.ordToDoc(cluster[j]), size, postingsOutput);
+            postingsOutput.writeGroupVInts(docDeltas, size);
             // write vectors
             bulkWriter.writeVectors(offHeapQuantizedVectors);
         }
@@ -717,4 +756,37 @@ public void readQuantizedVector(int ord, boolean isOverspill) throws IOException
             bitSum = quantizedVectorsInput.readShort();
         }
     }
+
+    private static class IntSorter extends IntroSorter {
+        int pivot = -1;
+        private final int[] arr;
+        private final IntToIntFunction func;
+
+        private IntSorter(int[] arr, IntToIntFunction func) {
+            this.arr = arr;
+            this.func = func;
+        }
+
+        @Override
+        protected void setPivot(int i) {
+            pivot = func.apply(arr[i]);
+        }
+
+        @Override
+        protected int comparePivot(int j) {
+            return Integer.compare(pivot, func.apply(arr[j]));
+        }
+
+        @Override
+        protected int compare(int a, int b) {
+            return Integer.compare(func.apply(arr[a]), func.apply(arr[b]));
+        }
+
+        @Override
+        protected void swap(int i, int j) {
+            final int tmp = arr[i];
+            arr[i] = arr[j];
+            arr[j] = tmp;
+        }
+    }
 }
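
IntSorter is the piece that makes the primitive indirect sort possible: it sorts an int[] of indices by a key computed through an IntToIntFunction, overriding IntroSorter's pivot hooks so no boxing or per-comparison allocation occurs. A usage sketch with made-up values (valid inside the enclosing writer class, where the private IntSorter is visible):

    int[] docIds = { 42, 7, 19, 3 };  // key for each ordinal
    int[] ords = { 0, 1, 2, 3 };      // indices to permute
    new IntSorter(ords, i -> docIds[i]).sort(0, ords.length);
    // ords is now { 3, 1, 2, 0 }: the ordinals in ascending docIds[ord] order

Extending IntroSorter also keeps the worst case at O(n log n): it runs quicksort but falls back to heapsort when recursion gets too deep.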
