Commit 8051f0d

Track vector disk usage by vectorReader.getOffHeapByteSize
Currently IndexDiskUsageAnalyzer reports the disk usage of vectors by:

- iterating through document values to access vector data
- performing sample searches to force loading of the index structures
- sampling only a subset of documents (chosen on a log scale)
- tracking all bytes read during these operations

One problem with this approach is that it is very slow. Another is that changes to search algorithms and different encodings make it difficult to write definitive tests and assert expected results, leading to test failures such as #127689.

This change modifies IndexDiskUsageAnalyzer to compute vector disk usage with vectorReader.getOffHeapByteSize, a method introduced in Lucene 10.3. As all vector files are off-heap, we can rely on this method to report the precise disk usage.

Closes #127689
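For illustration, here is a minimal sketch of the new accounting (the helper class and method names are hypothetical, not part of the change): given a segment's KnnVectorsReader and the FieldInfo of a vector field, the reported disk usage is simply the sum of the byte counts in the map returned by getOffHeapByteSize, with no vector iteration or sample searches involved.

```java
import java.io.IOException;
import java.util.Map;

import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.index.FieldInfo;

// Hypothetical helper mirroring the new logic in analyzeKnnVectors: the
// codec reports its off-heap footprint as a map of per-file byte counts,
// and summing the values gives the field's total disk usage.
final class VectorDiskUsage {
    static long totalOffHeapBytes(KnnVectorsReader vectorReader, FieldInfo field) throws IOException {
        Map<String, Long> offHeap = vectorReader.getOffHeapByteSize(field);
        long totalSize = 0;
        for (long size : offHeap.values()) {
            totalSize += size;
        }
        return totalSize;
    }
}
```

Because the sizes come straight from the codec rather than from instrumented reads, the result is deterministic across encodings and search-algorithm changes, which is what makes tests like testKnnVectors stable again.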
1 parent f77b0ee commit 8051f0d

File tree

- muted-tests.yml
- server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java

2 files changed: 5 additions & 58 deletions

muted-tests.yml

Lines changed: 0 additions & 3 deletions
@@ -408,9 +408,6 @@ tests:
 - class: org.elasticsearch.xpack.ccr.action.ShardFollowTaskReplicationTests
   method: testChangeFollowerHistoryUUID
   issue: https://github.com/elastic/elasticsearch/issues/127680
-- class: org.elasticsearch.action.admin.indices.diskusage.IndexDiskUsageAnalyzerTests
-  method: testKnnVectors
-  issue: https://github.com/elastic/elasticsearch/issues/127689
 - class: org.elasticsearch.backwards.MixedClusterClientYamlTestSuiteIT
   method: test {p0=search/350_point_in_time/point-in-time with index filter}
   issue: https://github.com/elastic/elasticsearch/issues/127741

server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java

Lines changed: 5 additions & 55 deletions
@@ -25,16 +25,13 @@
 import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat;
 import org.apache.lucene.index.BinaryDocValues;
-import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PointValues;
@@ -47,8 +44,6 @@
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.KnnCollector;
-import org.apache.lucene.search.TopKnnCollector;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FilterDirectory;
 import org.apache.lucene.store.IOContext;
@@ -562,57 +557,12 @@ void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
             cancellationChecker.checkForCancellation();
             directory.resetBytesRead();
             if (field.getVectorDimension() > 0) {
-                switch (field.getVectorEncoding()) {
-                    case BYTE -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getByteVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        ByteVectorValues vectorValues = vectorReader.getByteVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
-                    case FLOAT32 -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getFloatVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        FloatVectorValues vectorValues = vectorReader.getFloatVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
+                Map<String, Long> offHeap = vectorReader.getOffHeapByteSize(field);
+                long totalSize = 0;
+                for (var entry : offHeap.entrySet()) {
+                    totalSize += entry.getValue();
                 }
-
+                stats.addKnnVectors(field.name, totalSize);
             }
         }
     }
