Commit 0b66ad6
Track vector disk usage by vectorReader.getOffHeapByteSize (#128326)
Currently, IndexDiskUsageAnalyzer reports the disk usage of vectors by:
- iterating through document values to access vector data
- performing sample searches to force loading of the index structures
- sampling (only visiting a subset of documents, chosen on a log scale)
- tracking all bytes read during these operations

One problem with this approach is that it is very slow. Another is that changes to search algorithms and to the various encodings make it difficult to write deterministic tests and assert expected results, hence test failures such as #127689.

This change switches IndexDiskUsageAnalyzer to vectorReader.getOffHeapByteSize, a method introduced in Lucene 10.3. Since all vector files are off-heap, we can rely on this method to report the precise disk usage.

Closes #127689
1 parent 7aba8fb commit 0b66ad6
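For context, a minimal sketch of how the new API is consumed, assuming Lucene 10.3+, where KnnVectorsReader#getOffHeapByteSize(FieldInfo) returns a Map<String, Long> keyed by index-file extension (the commit below reads the "vec" key for raw vector data). The class and helper names here are illustrative, not part of the commit:

import java.util.Map;

import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentReader;

class KnnDiskUsageSketch {
    // Sum the reported off-heap byte sizes across all of the field's vector files.
    static long knnBytesForField(SegmentReader reader, FieldInfo field) {
        KnnVectorsReader vectorReader = reader.getVectorReader();
        if (vectorReader == null || field.getVectorDimension() == 0) {
            return 0L; // segment has no vectors, or this field is not a vector field
        }
        Map<String, Long> offHeap = vectorReader.getOffHeapByteSize(field);
        return offHeap.values().stream().mapToLong(Long::longValue).sum();
    }
}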

3 files changed: +35 -66 lines

muted-tests.yml

Lines changed: 0 additions & 3 deletions

@@ -399,9 +399,6 @@ tests:
 - class: org.elasticsearch.xpack.ccr.action.ShardFollowTaskReplicationTests
   method: testChangeFollowerHistoryUUID
   issue: https://github.com/elastic/elasticsearch/issues/127680
-- class: org.elasticsearch.action.admin.indices.diskusage.IndexDiskUsageAnalyzerTests
-  method: testKnnVectors
-  issue: https://github.com/elastic/elasticsearch/issues/127689
 - class: org.elasticsearch.backwards.MixedClusterClientYamlTestSuiteIT
   method: test {p0=search/350_point_in_time/point-in-time with index filter}
   issue: https://github.com/elastic/elasticsearch/issues/127741

server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java

Lines changed: 13 additions & 56 deletions

@@ -25,16 +25,13 @@
 import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat;
 import org.apache.lucene.index.BinaryDocValues;
-import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PointValues;
@@ -47,8 +44,6 @@
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.KnnCollector;
-import org.apache.lucene.search.TopKnnCollector;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FilterDirectory;
 import org.apache.lucene.store.IOContext;
@@ -553,7 +548,7 @@ void visitField(Fields vectors, String fieldName) throws IOException {
         }
     }
 
-    void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
+    void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) {
         KnnVectorsReader vectorReader = reader.getVectorReader();
         if (vectorReader == null) {
             return;
@@ -562,57 +557,19 @@ void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
             cancellationChecker.checkForCancellation();
             directory.resetBytesRead();
             if (field.getVectorDimension() > 0) {
-                switch (field.getVectorEncoding()) {
-                    case BYTE -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getByteVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        ByteVectorValues vectorValues = vectorReader.getByteVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
-                    case FLOAT32 -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getFloatVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        FloatVectorValues vectorValues = vectorReader.getFloatVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
+                Map<String, Long> offHeap = vectorReader.getOffHeapByteSize(field);
+                long totalSize = 0;
+                for (var entry : offHeap.entrySet()) {
+                    totalSize += entry.getValue();
                 }
-
+                long vectorsSize = offHeap.getOrDefault("vec", 0L);
+                if (vectorsSize == 0L) {
+                    // This can happen if .vec file is opened with directIO
+                    // calculate the size of vectors manually
+                    vectorsSize = field.getVectorDimension() * field.getVectorEncoding().byteSize;
+                    totalSize += vectorsSize;
+                }
+                stats.addKnnVectors(field.name, totalSize);
             }
         }
     }
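The directIO fallback above leans on the fixed-width layout of the flat vector file: each vector occupies dimension × encoding byte-size bytes. A sketch of that arithmetic, under the assumption that Lucene's VectorEncoding.byteSize is 1 for BYTE and Float.BYTES for FLOAT32 (the class and method names are illustrative):

import org.apache.lucene.index.VectorEncoding;

class FlatVectorSizeSketch {
    // Raw (flat) vector data is one fixed-width record per vector, so the
    // total is approximately numVectors * dimension * bytesPerComponent.
    static long flatVectorDataBytes(long numVectors, int dimension, VectorEncoding encoding) {
        return numVectors * dimension * encoding.byteSize;
    }
}

The test below applies the same formula to its expected dataBytes: numDocs × dimension × Float.BYTES for FLOAT32, and numDocs × dimension for BYTE.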

server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java

Lines changed: 22 additions & 7 deletions

@@ -24,6 +24,7 @@
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.KnnByteVectorField;
 import org.apache.lucene.document.KnnFloatVectorField;
 import org.apache.lucene.document.LatLonShape;
 import org.apache.lucene.document.LongPoint;
@@ -67,6 +68,7 @@
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.core.IOUtils;
 import org.elasticsearch.index.codec.postings.ES812PostingsFormat;
+import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.index.store.LuceneFilesExtensions;
 import org.elasticsearch.test.ESTestCase;
@@ -254,15 +256,27 @@ public void testKnnVectors() throws Exception {
         VectorSimilarityFunction similarity = randomFrom(VectorSimilarityFunction.values());
         int numDocs = between(1000, 5000);
         int dimension = between(10, 200);
+        DenseVectorFieldMapper.ElementType elementType = randomFrom(DenseVectorFieldMapper.ElementType.values());
 
-        indexRandomly(dir, codec, numDocs, doc -> {
-            float[] vector = randomVector(dimension);
-            doc.add(new KnnFloatVectorField("vector", vector, similarity));
-        });
+        if (elementType == DenseVectorFieldMapper.ElementType.FLOAT) {
+            indexRandomly(dir, codec, numDocs, doc -> {
+                float[] vector = randomVector(dimension);
+                doc.add(new KnnFloatVectorField("vector", vector, similarity));
+            });
+        } else {
+            indexRandomly(dir, codec, numDocs, doc -> {
+                byte[] vector = new byte[dimension];
+                random().nextBytes(vector);
+                doc.add(new KnnByteVectorField("vector", vector, similarity));
+            });
+        }
         final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(dir), () -> {});
         logger.info("--> stats {}", stats);
 
-        long dataBytes = (long) numDocs * dimension * Float.BYTES; // size of flat vector data
+        // expected size of flat vector data
+        long dataBytes = elementType == DenseVectorFieldMapper.ElementType.FLOAT
+            ? ((long) numDocs * dimension * Float.BYTES)
+            : ((long) numDocs * dimension);
         long indexBytesEstimate = (long) numDocs * (Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN / 4); // rough size of HNSW graph
         assertThat("numDocs=" + numDocs + ";dimension=" + dimension, stats.total().getKnnVectorsBytes(), greaterThan(dataBytes));
         long connectionOverhead = stats.total().getKnnVectorsBytes() - dataBytes;
@@ -762,8 +776,9 @@ private static void assertStats(IndexDiskUsageStats actualStats, IndexDiskUsageStats expectedStats) {
             0.01,
             2048
         );
-
-        assertFieldStats(field, "knn vectors", actualField.getKnnVectorsBytes(), expectedField.getKnnVectorsBytes(), 0.01, 1024);
+        // Allow difference of a file block size for knn vectors
+        // we get knn data usage from getOffHeapByteSize but when written on disk it can be rounded to the next block size
+        assertFieldStats(field, "knn vectors", actualField.getKnnVectorsBytes(), expectedField.getKnnVectorsBytes(), 0.01, 4096);
     }
     // We are not able to collect per field stats for stored, vector, points, and norms
     IndexDiskUsageStats.PerFieldDiskUsage actualTotal = actualStats.total();
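The widened 4096-byte slack matches a typical file-system block size: getOffHeapByteSize reports the logical data size, while on-disk files can be rounded up to the next block. A hypothetical sketch of such a tolerance check (assertWithinTolerance is illustrative, not the test's actual assertFieldStats implementation):

class ToleranceSketch {
    // Accept the actual byte count if it is within the relative error or
    // within the absolute slack (e.g. one 4 KiB block), whichever is larger.
    static void assertWithinTolerance(long actual, long expected, double relativeError, long absoluteSlack) {
        long allowed = Math.max((long) (expected * relativeError), absoluteSlack);
        if (Math.abs(actual - expected) > allowed) {
            throw new AssertionError("expected " + expected + " +/- " + allowed + " but got " + actual);
        }
    }
}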
