Skip to content

Commit 98fb570

Browse files
committed
Merge remote-tracking branch 'upstream/main' into structured_source
2 parents 4cda678 + 3ccb52d commit 98fb570

File tree

67 files changed

+3421
-1968
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+3421
-1968
lines changed

docs/changelog/132506.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 132506
2+
summary: Add .integration_knowledge system index for usage by AI assistants
3+
area: Infra/Core
4+
type: feature
5+
issues: []

muted-tests.yml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -543,9 +543,6 @@ tests:
543543
- class: org.elasticsearch.test.rest.yaml.CcsCommonYamlTestSuiteIT
544544
method: test {p0=search/160_exists_query/Test exists query on unmapped float field}
545545
issue: https://github.com/elastic/elasticsearch/issues/132984
546-
- class: org.elasticsearch.xpack.esql.analysis.AnalyzerTests
547-
method: testMagnitudePlanWithDenseVectorImplicitCasting
548-
issue: https://github.com/elastic/elasticsearch/issues/132985
549546
- class: org.elasticsearch.xpack.search.AsyncSearchErrorTraceIT
550547
method: testAsyncSearchFailingQueryErrorTraceDefault
551548
issue: https://github.com/elastic/elasticsearch/issues/133010
@@ -582,9 +579,6 @@ tests:
582579
- class: org.elasticsearch.test.rest.yaml.CcsCommonYamlTestSuiteIT
583580
method: test {p0=search/160_exists_query/Test exists query on _id field}
584581
issue: https://github.com/elastic/elasticsearch/issues/133097
585-
- class: org.elasticsearch.xpack.esql.analysis.AnalyzerTests
586-
method: testNoDenseVectorFailsForMagnitude
587-
issue: https://github.com/elastic/elasticsearch/issues/133013
588582
- class: org.elasticsearch.xpack.ml.integration.TextEmbeddingQueryIT
589583
method: testModelWithPrefixStrings
590584
issue: https://github.com/elastic/elasticsearch/issues/133138

server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsReader.java

Lines changed: 67 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,50 @@ public DefaultIVFVectorsReader(SegmentReadState state, FlatVectorsReader rawVect
4747
super(state, rawVectorsReader);
4848
}
4949

50+
CentroidIterator getPostingListPrefetchIterator(CentroidIterator centroidIterator, IndexInput postingListSlice) throws IOException {
51+
return new CentroidIterator() {
52+
CentroidOffsetAndLength nextOffsetAndLength = centroidIterator.hasNext()
53+
? centroidIterator.nextPostingListOffsetAndLength()
54+
: null;
55+
56+
{
57+
// prefetch the first one
58+
if (nextOffsetAndLength != null) {
59+
prefetch(nextOffsetAndLength);
60+
}
61+
}
62+
63+
void prefetch(CentroidOffsetAndLength offsetAndLength) throws IOException {
64+
postingListSlice.prefetch(offsetAndLength.offset(), offsetAndLength.length());
65+
}
66+
67+
@Override
68+
public boolean hasNext() {
69+
return nextOffsetAndLength != null;
70+
}
71+
72+
@Override
73+
public CentroidOffsetAndLength nextPostingListOffsetAndLength() throws IOException {
74+
CentroidOffsetAndLength offsetAndLength = nextOffsetAndLength;
75+
if (centroidIterator.hasNext()) {
76+
nextOffsetAndLength = centroidIterator.nextPostingListOffsetAndLength();
77+
prefetch(nextOffsetAndLength);
78+
} else {
79+
nextOffsetAndLength = null; // indicate we reached the end
80+
}
81+
return offsetAndLength;
82+
}
83+
};
84+
}
85+
5086
@Override
51-
CentroidIterator getCentroidIterator(FieldInfo fieldInfo, int numCentroids, IndexInput centroids, float[] targetQuery)
52-
throws IOException {
87+
CentroidIterator getCentroidIterator(
88+
FieldInfo fieldInfo,
89+
int numCentroids,
90+
IndexInput centroids,
91+
float[] targetQuery,
92+
IndexInput postingListSlice
93+
) throws IOException {
5394
final FieldEntry fieldEntry = fields.get(fieldInfo.number);
5495
final float globalCentroidDp = fieldEntry.globalCentroidDp();
5596
final OptimizedScalarQuantizer scalarQuantizer = new OptimizedScalarQuantizer(fieldInfo.getVectorSimilarityFunction());
@@ -71,8 +112,9 @@ CentroidIterator getCentroidIterator(FieldInfo fieldInfo, int numCentroids, Inde
71112
final ES92Int7VectorsScorer scorer = ESVectorUtil.getES92Int7VectorsScorer(centroids, fieldInfo.getVectorDimension());
72113
centroids.seek(0L);
73114
int numParents = centroids.readVInt();
115+
CentroidIterator centroidIterator;
74116
if (numParents > 0) {
75-
return getCentroidIteratorWithParents(
117+
centroidIterator = getCentroidIteratorWithParents(
76118
fieldInfo,
77119
centroids,
78120
numParents,
@@ -82,8 +124,18 @@ CentroidIterator getCentroidIterator(FieldInfo fieldInfo, int numCentroids, Inde
82124
queryParams,
83125
globalCentroidDp
84126
);
127+
} else {
128+
centroidIterator = getCentroidIteratorNoParent(
129+
fieldInfo,
130+
centroids,
131+
numCentroids,
132+
scorer,
133+
quantized,
134+
queryParams,
135+
globalCentroidDp
136+
);
85137
}
86-
return getCentroidIteratorNoParent(fieldInfo, centroids, numCentroids, scorer, quantized, queryParams, globalCentroidDp);
138+
return getPostingListPrefetchIterator(centroidIterator, postingListSlice);
87139
}
88140

89141
private static CentroidIterator getCentroidIteratorNoParent(
@@ -115,10 +167,12 @@ public boolean hasNext() {
115167
}
116168

117169
@Override
118-
public long nextPostingListOffset() throws IOException {
170+
public CentroidOffsetAndLength nextPostingListOffsetAndLength() throws IOException {
119171
int centroidOrdinal = neighborQueue.pop();
120-
centroids.seek(offset + (long) Long.BYTES * centroidOrdinal);
121-
return centroids.readLong();
172+
centroids.seek(offset + (long) Long.BYTES * 2 * centroidOrdinal);
173+
long postingListOffset = centroids.readLong();
174+
long postingListLength = centroids.readLong();
175+
return new CentroidOffsetAndLength(postingListOffset, postingListLength);
122176
}
123177
};
124178
}
@@ -185,11 +239,13 @@ public boolean hasNext() {
185239
}
186240

187241
@Override
188-
public long nextPostingListOffset() throws IOException {
242+
public CentroidOffsetAndLength nextPostingListOffsetAndLength() throws IOException {
189243
int centroidOrdinal = neighborQueue.pop();
190244
updateQueue(); // add one children if available so the queue remains fully populated
191-
centroids.seek(childrenFileOffsets + (long) Long.BYTES * centroidOrdinal);
192-
return centroids.readLong();
245+
centroids.seek(childrenFileOffsets + (long) Long.BYTES * 2 * centroidOrdinal);
246+
long postingListOffset = centroids.readLong();
247+
long postingListLength = centroids.readLong();
248+
return new CentroidOffsetAndLength(postingListOffset, postingListLength);
193249
}
194250

195251
private void updateQueue() throws IOException {
@@ -452,6 +508,7 @@ public int visit(KnnCollector knnCollector) throws IOException {
452508
int scoredDocs = 0;
453509
int limit = vectors - BULK_SIZE + 1;
454510
int i = 0;
511+
455512
for (; i < limit; i += BULK_SIZE) {
456513
final int docsToBulkScore = acceptDocs == null ? BULK_SIZE : docToBulkScore(docIdsScratch, i, acceptDocs);
457514
if (docsToBulkScore == 0) {

server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
import org.apache.lucene.store.IndexInput;
1919
import org.apache.lucene.store.IndexOutput;
2020
import org.apache.lucene.util.IntroSorter;
21-
import org.apache.lucene.util.LongValues;
2221
import org.apache.lucene.util.VectorUtil;
2322
import org.apache.lucene.util.hnsw.IntToIntFunction;
2423
import org.apache.lucene.util.packed.PackedInts;
@@ -60,7 +59,7 @@ public DefaultIVFVectorsWriter(
6059
}
6160

6261
@Override
63-
LongValues buildAndWritePostingsLists(
62+
CentroidOffsetAndLength buildAndWritePostingsLists(
6463
FieldInfo fieldInfo,
6564
CentroidSupplier centroidSupplier,
6665
FloatVectorValues floatVectorValues,
@@ -102,6 +101,7 @@ LongValues buildAndWritePostingsLists(
102101
postingsOutput.writeVInt(maxPostingListSize);
103102
// write the posting lists
104103
final PackedLongValues.Builder offsets = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
104+
final PackedLongValues.Builder lengths = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
105105
DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter.OneBitDiskBBQBulkWriter(ES91OSQVectorsScorer.BULK_SIZE, postingsOutput);
106106
OnHeapQuantizedVectors onHeapQuantizedVectors = new OnHeapQuantizedVectors(
107107
floatVectorValues,
@@ -116,7 +116,8 @@ LongValues buildAndWritePostingsLists(
116116
for (int c = 0; c < centroidSupplier.size(); c++) {
117117
float[] centroid = centroidSupplier.centroid(c);
118118
int[] cluster = assignmentsByCluster[c];
119-
offsets.add(postingsOutput.alignFilePointer(Float.BYTES) - fileOffset);
119+
long offset = postingsOutput.alignFilePointer(Float.BYTES) - fileOffset;
120+
offsets.add(offset);
120121
buffer.asFloatBuffer().put(centroid);
121122
// write raw centroid for quantizing the query vectors
122123
postingsOutput.writeBytes(buffer.array(), buffer.array().length);
@@ -142,17 +143,18 @@ LongValues buildAndWritePostingsLists(
142143
idsWriter.writeDocIds(i -> docDeltas[i], size, postingsOutput);
143144
// write vectors
144145
bulkWriter.writeVectors(onHeapQuantizedVectors);
146+
lengths.add(postingsOutput.getFilePointer() - fileOffset - offset);
145147
}
146148

147149
if (logger.isDebugEnabled()) {
148150
printClusterQualityStatistics(assignmentsByCluster);
149151
}
150152

151-
return offsets.build();
153+
return new CentroidOffsetAndLength(offsets.build(), lengths.build());
152154
}
153155

154156
@Override
155-
LongValues buildAndWritePostingsLists(
157+
CentroidOffsetAndLength buildAndWritePostingsLists(
156158
FieldInfo fieldInfo,
157159
CentroidSupplier centroidSupplier,
158160
FloatVectorValues floatVectorValues,
@@ -243,6 +245,7 @@ LongValues buildAndWritePostingsLists(
243245
// now we can read the quantized vectors from the temporary file
244246
try (IndexInput quantizedVectorsInput = mergeState.segmentInfo.dir.openInput(quantizedVectorsTempName, IOContext.DEFAULT)) {
245247
final PackedLongValues.Builder offsets = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
248+
final PackedLongValues.Builder lengths = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
246249
OffHeapQuantizedVectors offHeapQuantizedVectors = new OffHeapQuantizedVectors(
247250
quantizedVectorsInput,
248251
fieldInfo.getVectorDimension()
@@ -260,7 +263,8 @@ LongValues buildAndWritePostingsLists(
260263
float[] centroid = centroidSupplier.centroid(c);
261264
int[] cluster = assignmentsByCluster[c];
262265
boolean[] isOverspill = isOverspillByCluster[c];
263-
offsets.add(postingsOutput.alignFilePointer(Float.BYTES) - fileOffset);
266+
long offset = postingsOutput.alignFilePointer(Float.BYTES) - fileOffset;
267+
offsets.add(offset);
264268
// write raw centroid for quantizing the query vectors
265269
buffer.asFloatBuffer().put(centroid);
266270
postingsOutput.writeBytes(buffer.array(), buffer.array().length);
@@ -286,12 +290,14 @@ LongValues buildAndWritePostingsLists(
286290
idsWriter.writeDocIds(i -> docDeltas[i], size, postingsOutput);
287291
// write vectors
288292
bulkWriter.writeVectors(offHeapQuantizedVectors);
293+
lengths.add(postingsOutput.getFilePointer() - fileOffset - offset);
294+
// lengths.add(1);
289295
}
290296

291297
if (logger.isDebugEnabled()) {
292298
printClusterQualityStatistics(assignmentsByCluster);
293299
}
294-
return offsets.build();
300+
return new CentroidOffsetAndLength(offsets.build(), lengths.build());
295301
}
296302
}
297303

@@ -335,24 +341,24 @@ void writeCentroids(
335341
FieldInfo fieldInfo,
336342
CentroidSupplier centroidSupplier,
337343
float[] globalCentroid,
338-
LongValues offsets,
344+
CentroidOffsetAndLength centroidOffsetAndLength,
339345
IndexOutput centroidOutput
340346
) throws IOException {
341347
// TODO do we want to store these distances as well for future use?
342348
// TODO: sort centroids by global centroid (was doing so previously here)
343349
// TODO: sorting tanks recall possibly because centroids ordinals no longer are aligned
344350
if (centroidSupplier.size() > centroidsPerParentCluster * centroidsPerParentCluster) {
345-
writeCentroidsWithParents(fieldInfo, centroidSupplier, globalCentroid, offsets, centroidOutput);
351+
writeCentroidsWithParents(fieldInfo, centroidSupplier, globalCentroid, centroidOffsetAndLength, centroidOutput);
346352
} else {
347-
writeCentroidsWithoutParents(fieldInfo, centroidSupplier, globalCentroid, offsets, centroidOutput);
353+
writeCentroidsWithoutParents(fieldInfo, centroidSupplier, globalCentroid, centroidOffsetAndLength, centroidOutput);
348354
}
349355
}
350356

351357
private void writeCentroidsWithParents(
352358
FieldInfo fieldInfo,
353359
CentroidSupplier centroidSupplier,
354360
float[] globalCentroid,
355-
LongValues offsets,
361+
CentroidOffsetAndLength centroidOffsetAndLength,
356362
IndexOutput centroidOutput
357363
) throws IOException {
358364
DiskBBQBulkWriter.SevenBitDiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter.SevenBitDiskBBQBulkWriter(
@@ -392,7 +398,8 @@ private void writeCentroidsWithParents(
392398
for (int i = 0; i < centroidGroups.centroids().length; i++) {
393399
final int[] centroidAssignments = centroidGroups.vectors()[i];
394400
for (int assignment : centroidAssignments) {
395-
centroidOutput.writeLong(offsets.get(assignment));
401+
centroidOutput.writeLong(centroidOffsetAndLength.offsets().get(assignment));
402+
centroidOutput.writeLong(centroidOffsetAndLength.lengths().get(assignment));
396403
}
397404
}
398405
}
@@ -401,7 +408,7 @@ private void writeCentroidsWithoutParents(
401408
FieldInfo fieldInfo,
402409
CentroidSupplier centroidSupplier,
403410
float[] globalCentroid,
404-
LongValues offsets,
411+
CentroidOffsetAndLength centroidOffsetAndLength,
405412
IndexOutput centroidOutput
406413
) throws IOException {
407414
centroidOutput.writeVInt(0);
@@ -419,7 +426,8 @@ private void writeCentroidsWithoutParents(
419426
bulkWriter.writeVectors(quantizedCentroids);
420427
// write the centroid offsets at the end of the file
421428
for (int i = 0; i < centroidSupplier.size(); i++) {
422-
centroidOutput.writeLong(offsets.get(i));
429+
centroidOutput.writeLong(centroidOffsetAndLength.offsets().get(i));
430+
centroidOutput.writeLong(centroidOffsetAndLength.lengths().get(i));
423431
}
424432
}
425433

server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsReader.java

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,13 @@ protected IVFVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsR
8585
}
8686
}
8787

88-
abstract CentroidIterator getCentroidIterator(FieldInfo fieldInfo, int numCentroids, IndexInput centroids, float[] target)
89-
throws IOException;
88+
abstract CentroidIterator getCentroidIterator(
89+
FieldInfo fieldInfo,
90+
int numCentroids,
91+
IndexInput centroids,
92+
float[] target,
93+
IndexInput postingListSlice
94+
) throws IOException;
9095

9196
private static IndexInput openDataInput(
9297
SegmentReadState state,
@@ -241,31 +246,37 @@ public final void search(String field, float[] target, KnnCollector knnCollector
241246
}
242247
// we account for soar vectors here. We can potentially visit a vector twice so we multiply by 2 here.
243248
long maxVectorVisited = (long) (2.0 * visitRatio * numVectors);
244-
CentroidIterator centroidIterator = getCentroidIterator(fieldInfo, entry.numCentroids, entry.centroidSlice(ivfCentroids), target);
245-
PostingVisitor scorer = getPostingVisitor(fieldInfo, entry.postingListSlice(ivfClusters), target, acceptDocs);
246-
249+
IndexInput postListSlice = entry.postingListSlice(ivfClusters);
250+
CentroidIterator centroidPrefetchingIterator = getCentroidIterator(
251+
fieldInfo,
252+
entry.numCentroids,
253+
entry.centroidSlice(ivfCentroids),
254+
target,
255+
postListSlice
256+
);
257+
PostingVisitor scorer = getPostingVisitor(fieldInfo, postListSlice, target, acceptDocs);
247258
long expectedDocs = 0;
248259
long actualDocs = 0;
249260
// initially we visit only the "centroids to search"
250261
// Note, numCollected is doing the bare minimum here.
251262
// TODO do we need to handle nested doc counts similarly to how we handle
252263
// filtering? E.g. keep exploring until we hit an expected number of parent documents vs. child vectors?
253-
while (centroidIterator.hasNext()
264+
while (centroidPrefetchingIterator.hasNext()
254265
&& (maxVectorVisited > expectedDocs || knnCollector.minCompetitiveSimilarity() == Float.NEGATIVE_INFINITY)) {
255266
// todo do we actually need to know the score???
256-
long offset = centroidIterator.nextPostingListOffset();
267+
CentroidOffsetAndLength offsetAndLength = centroidPrefetchingIterator.nextPostingListOffsetAndLength();
257268
// todo do we need direct access to the raw centroid???, this is used for quantizing, maybe hydrating and quantizing
258269
// is enough?
259-
expectedDocs += scorer.resetPostingsScorer(offset);
270+
expectedDocs += scorer.resetPostingsScorer(offsetAndLength.offset());
260271
actualDocs += scorer.visit(knnCollector);
261272
}
262273
if (acceptDocs != null) {
263274
float unfilteredRatioVisited = (float) expectedDocs / numVectors;
264275
int filteredVectors = (int) Math.ceil(numVectors * percentFiltered);
265276
float expectedScored = Math.min(2 * filteredVectors * unfilteredRatioVisited, expectedDocs / 2f);
266-
while (centroidIterator.hasNext() && (actualDocs < expectedScored || actualDocs < knnCollector.k())) {
267-
long offset = centroidIterator.nextPostingListOffset();
268-
scorer.resetPostingsScorer(offset);
277+
while (centroidPrefetchingIterator.hasNext() && (actualDocs < expectedScored || actualDocs < knnCollector.k())) {
278+
CentroidOffsetAndLength offsetAndLength = centroidPrefetchingIterator.nextPostingListOffsetAndLength();
279+
scorer.resetPostingsScorer(offsetAndLength.offset());
269280
actualDocs += scorer.visit(knnCollector);
270281
}
271282
}
@@ -312,10 +323,12 @@ IndexInput postingListSlice(IndexInput postingListFile) throws IOException {
312323
abstract PostingVisitor getPostingVisitor(FieldInfo fieldInfo, IndexInput postingsLists, float[] target, Bits needsScoring)
313324
throws IOException;
314325

326+
record CentroidOffsetAndLength(long offset, long length) {}
327+
315328
interface CentroidIterator {
316329
boolean hasNext();
317330

318-
long nextPostingListOffset() throws IOException;
331+
CentroidOffsetAndLength nextPostingListOffsetAndLength() throws IOException;
319332
}
320333

321334
interface PostingVisitor {

0 commit comments

Comments
 (0)