
Commit 9d6d677

Track vector disk usage by vectorReader.getOffHeapByteSize
Currently, IndexDiskUsageAnalyzer reports the disk usage of vectors by:

- iterating through document values to access vector data,
- performing sample searches to force loading of the index structures,
- using a sampling approach (visiting only a subset of documents, chosen on a log scale), and
- tracking all bytes read during these operations.

One problem with this approach is that it is very slow. Another is that changes to search algorithms and differences between encodings make it difficult to write deterministic tests and assert expected results, hence a test failure such as #127689.

This change modifies IndexDiskUsageAnalyzer to report vector disk usage through vectorReader.getOffHeapByteSize, a method introduced in Lucene 10.3. Since all vector files are off-heap, we can rely on this method to report the precise disk usage.

Closes #127689
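For context, the new accounting reduces to the sketch below: a hypothetical helper (the OffHeapVectorSize class is not part of this change) that sums the per-file sizes returned by the Lucene 10.3 method used in the diff. The file extensions named in the comments are an assumption about the HNSW codec's file naming, not something stated in the commit.

```java
import java.util.Map;

import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.index.FieldInfo;

// Hypothetical helper mirroring the new accounting: sum the per-file
// off-heap sizes that Lucene reports for one vector field.
final class OffHeapVectorSize {
    static long totalBytes(KnnVectorsReader vectorReader, FieldInfo field) {
        // Keys are index-file extensions (for example "vec" for raw vectors
        // and "vex" for the HNSW graph); values are bytes mapped off heap.
        Map<String, Long> offHeap = vectorReader.getOffHeapByteSize(field);
        long totalSize = 0;
        for (var entry : offHeap.entrySet()) {
            totalSize += entry.getValue();
        }
        return totalSize;
    }
}
```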
1 parent 26b66b1 commit 9d6d677

File tree: 2 files changed (+5, −58 lines)

muted-tests.yml

Lines changed: 0 additions & 3 deletions

@@ -399,9 +399,6 @@ tests:
 - class: org.elasticsearch.xpack.ccr.action.ShardFollowTaskReplicationTests
   method: testChangeFollowerHistoryUUID
   issue: https://github.com/elastic/elasticsearch/issues/127680
-- class: org.elasticsearch.action.admin.indices.diskusage.IndexDiskUsageAnalyzerTests
-  method: testKnnVectors
-  issue: https://github.com/elastic/elasticsearch/issues/127689
 - class: org.elasticsearch.backwards.MixedClusterClientYamlTestSuiteIT
   method: test {p0=search/350_point_in_time/point-in-time with index filter}
   issue: https://github.com/elastic/elasticsearch/issues/127741

server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java

Lines changed: 5 additions & 55 deletions
@@ -25,16 +25,13 @@
 import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat;
 import org.apache.lucene.index.BinaryDocValues;
-import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PointValues;
@@ -47,8 +44,6 @@
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.KnnCollector;
-import org.apache.lucene.search.TopKnnCollector;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FilterDirectory;
 import org.apache.lucene.store.IOContext;
@@ -562,57 +557,12 @@ void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws I
             cancellationChecker.checkForCancellation();
             directory.resetBytesRead();
             if (field.getVectorDimension() > 0) {
-                switch (field.getVectorEncoding()) {
-                    case BYTE -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getByteVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        ByteVectorValues vectorValues = vectorReader.getByteVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
-                    case FLOAT32 -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getFloatVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        FloatVectorValues vectorValues = vectorReader.getFloatVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
+                Map<String, Long> offHeap = vectorReader.getOffHeapByteSize(field);
+                long totalSize = 0;
+                for (var entry : offHeap.entrySet()) {
+                    totalSize += entry.getValue();
                 }
-
+                stats.addKnnVectors(field.name, totalSize);
             }
         }
     }
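Since every vector index file is accessed off heap, summing the map's values covers the field's full on-disk footprint without reading any vector data or issuing searches. As a minor style note, the summation could equivalently be written with a stream; a sketch assuming the same `offHeap` map as in the diff:

```java
// Equivalent to the explicit loop in the diff above.
long totalSize = offHeap.values().stream().mapToLong(Long::longValue).sum();
```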
