muted-tests.yml (0 additions, 3 deletions)
@@ -399,9 +399,6 @@ tests:
 - class: org.elasticsearch.xpack.ccr.action.ShardFollowTaskReplicationTests
   method: testChangeFollowerHistoryUUID
   issue: https://github.com/elastic/elasticsearch/issues/127680
-- class: org.elasticsearch.action.admin.indices.diskusage.IndexDiskUsageAnalyzerTests
-  method: testKnnVectors
-  issue: https://github.com/elastic/elasticsearch/issues/127689
 - class: org.elasticsearch.backwards.MixedClusterClientYamlTestSuiteIT
   method: test {p0=search/350_point_in_time/point-in-time with index filter}
   issue: https://github.com/elastic/elasticsearch/issues/127741
IndexDiskUsageAnalyzer.java
@@ -25,16 +25,13 @@
 import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat;
 import org.apache.lucene.index.BinaryDocValues;
-import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PointValues;
@@ -47,8 +44,6 @@
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.KnnCollector;
-import org.apache.lucene.search.TopKnnCollector;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FilterDirectory;
 import org.apache.lucene.store.IOContext;
@@ -553,7 +548,7 @@ void visitField(Fields vectors, String fieldName) throws IOException {
         }
     }
 
-    void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
+    void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) {
         KnnVectorsReader vectorReader = reader.getVectorReader();
         if (vectorReader == null) {
             return;
@@ -562,57 +557,19 @@ void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
             cancellationChecker.checkForCancellation();
             directory.resetBytesRead();
             if (field.getVectorDimension() > 0) {
-                switch (field.getVectorEncoding()) {
-                    case BYTE -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getByteVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        ByteVectorValues vectorValues = vectorReader.getByteVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
-                    case FLOAT32 -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getFloatVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        FloatVectorValues vectorValues = vectorReader.getFloatVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
-                }
+                Map<String, Long> offHeap = vectorReader.getOffHeapByteSize(field);
+                long totalSize = 0;
+                for (var entry : offHeap.entrySet()) {
+                    totalSize += entry.getValue();
+                }
+
+                long vectorsSize = offHeap.getOrDefault("vec", 0L);
+                if (vectorsSize == 0L) {
+                    // This can happen if the .vec file is opened with directIO;
+                    // calculate the size of the vectors manually
+                    vectorsSize = field.getVectorDimension() * field.getVectorEncoding().byteSize;
+                    totalSize += vectorsSize;
+                }
+                stats.addKnnVectors(field.name, totalSize);
             }
         }
     }
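
Note on the change above: the rewritten analyzeKnnVectors no longer runs sampled vector searches to infer index file offsets; it sums the per-file sizes that the vectors reader reports via getOffHeapByteSize and falls back to a manual estimate of the flat vector data when the raw vector file is not reported. A self-contained sketch of that pattern follows. The map contents are hypothetical stand-ins for what a real KnnVectorsReader would return (keys mirror Lucene vector file suffixes, "vec" for flat vector data and "vex" for the HNSW graph), and the fallback here uses the flat-data formula the test below asserts (numDocs * dimension * bytes per element):

    import java.util.Map;

    // Illustrative sketch only: estimate a field's KNN disk usage from
    // per-file off-heap sizes, in the style of the new analyzeKnnVectors.
    public class KnnVectorsSizeSketch {
        public static void main(String[] args) {
            // Hypothetical sizes; no "vec" entry simulates the directIO case.
            Map<String, Long> offHeap = Map.of("vex", 65_536L);
            int numVectors = 1_000;  // hypothetical vector count
            int dimension = 128;     // hypothetical vector dimension
            int bytesPerElement = 4; // Float.BYTES for FLOAT32, 1 for BYTE

            // Sum whatever the reader reports per file type.
            long totalSize = 0;
            for (var entry : offHeap.entrySet()) {
                totalSize += entry.getValue();
            }
            // If the flat vector file is not reported (e.g. opened with
            // directIO), estimate its size from the vector geometry instead.
            if (offHeap.getOrDefault("vec", 0L) == 0L) {
                totalSize += (long) numVectors * dimension * bytesPerElement;
            }
            System.out.println("estimated knn vectors bytes: " + totalSize);
        }
    }
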
IndexDiskUsageAnalyzerTests.java
@@ -24,6 +24,7 @@
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.KnnByteVectorField;
 import org.apache.lucene.document.KnnFloatVectorField;
 import org.apache.lucene.document.LatLonShape;
 import org.apache.lucene.document.LongPoint;
@@ -67,6 +68,7 @@
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.core.IOUtils;
 import org.elasticsearch.index.codec.postings.ES812PostingsFormat;
+import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.index.store.LuceneFilesExtensions;
 import org.elasticsearch.test.ESTestCase;
@@ -254,15 +256,27 @@ public void testKnnVectors() throws Exception {
         VectorSimilarityFunction similarity = randomFrom(VectorSimilarityFunction.values());
         int numDocs = between(1000, 5000);
         int dimension = between(10, 200);
+        DenseVectorFieldMapper.ElementType elementType = randomFrom(DenseVectorFieldMapper.ElementType.values());
 
-        indexRandomly(dir, codec, numDocs, doc -> {
-            float[] vector = randomVector(dimension);
-            doc.add(new KnnFloatVectorField("vector", vector, similarity));
-        });
+        if (elementType == DenseVectorFieldMapper.ElementType.FLOAT) {
+            indexRandomly(dir, codec, numDocs, doc -> {
+                float[] vector = randomVector(dimension);
+                doc.add(new KnnFloatVectorField("vector", vector, similarity));
+            });
+        } else {
+            indexRandomly(dir, codec, numDocs, doc -> {
+                byte[] vector = new byte[dimension];
+                random().nextBytes(vector);
+                doc.add(new KnnByteVectorField("vector", vector, similarity));
+            });
+        }
         final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(dir), () -> {});
         logger.info("--> stats {}", stats);
 
-        long dataBytes = (long) numDocs * dimension * Float.BYTES; // size of flat vector data
+        // expected size of flat vector data
+        long dataBytes = elementType == DenseVectorFieldMapper.ElementType.FLOAT
+            ? ((long) numDocs * dimension * Float.BYTES)
+            : ((long) numDocs * dimension);
         long indexBytesEstimate = (long) numDocs * (Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN / 4); // rough size of HNSW graph
         assertThat("numDocs=" + numDocs + ";dimension=" + dimension, stats.total().getKnnVectorsBytes(), greaterThan(dataBytes));
         long connectionOverhead = stats.total().getKnnVectorsBytes() - dataBytes;
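
To make the expected sizes in this hunk concrete: with, say, numDocs = 1000 and dimension = 128 (both inside the test's randomized ranges), the flat vector data comes to 1000 * 128 * 4 = 512,000 bytes for FLOAT encoding and 1000 * 128 * 1 = 128,000 bytes for BYTE, while the rough HNSW graph estimate is 1000 * (16 / 4) = 4,000 bytes, assuming Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN has its usual Lucene default of 16.
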
@@ -762,8 +776,9 @@ private static void assertStats(IndexDiskUsageStats actualStats, IndexDiskUsageS
                 0.01,
                 2048
             );
-
-            assertFieldStats(field, "knn vectors", actualField.getKnnVectorsBytes(), expectedField.getKnnVectorsBytes(), 0.01, 1024);
+            // Allow a difference of one file block size for knn vectors:
+            // we get knn data usage from getOffHeapByteSize, but when written on disk it can be rounded up to the next block size
+            assertFieldStats(field, "knn vectors", actualField.getKnnVectorsBytes(), expectedField.getKnnVectorsBytes(), 0.01, 4096);
         }
         // We are not able to collect per field stats for stored, vector, points, and norms
         IndexDiskUsageStats.PerFieldDiskUsage actualTotal = actualStats.total();
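
The wider 4096-byte allowance follows from the comment in the hunk above: getOffHeapByteSize reports logical data sizes, but a file's on-disk footprint is rounded up to a whole number of file-system blocks, commonly 4096 bytes. For example, a hypothetical 130,000-byte .vec file occupies ceil(130000 / 4096) = 32 blocks, i.e. 131,072 bytes on disk, a discrepancy of 1,072 bytes that the old 1024-byte tolerance could not absorb.
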