From 9d6d67734dfd5e43f084f2cc5cb794025005c4b7 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Mon, 26 May 2025 15:28:58 -0400
Subject: [PATCH 1/3] Track vector disk usage via vectorReader.getOffHeapByteSize

Currently IndexDiskUsageAnalyzer reports the disk usage of vectors by:

- iterating through document values to access vector data
- performing sample searches to force loading of the index structures
- using a sampling approach (only visiting a subset of documents, based
  on a log scale)
- tracking all bytes read during these operations

One problem with this approach is that it is very slow. Another is that
changes to search algorithms and different vector encodings make it
difficult to write deterministic tests and assert expected results,
hence test failures such as #127689.

This change modifies IndexDiskUsageAnalyzer to report vector disk usage
through vectorReader.getOffHeapByteSize, a method newly introduced in
Lucene 10.3. As all vector files are off-heap, we can rely on this
method to report the precise disk usage.
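
For reference, the new per-field accounting reduces to roughly the
following sketch (surrounding loop and plumbing elided, and fieldInfo
stands for the FieldInfo of one vector field; getOffHeapByteSize
returns a map from index file extension, e.g. "vec" for raw vectors or
"vex" for the HNSW graph, to its size in bytes):

    // sum the off-heap sizes Lucene reports for one vector field
    Map<String, Long> offHeap = vectorReader.getOffHeapByteSize(fieldInfo);
    long totalSize = 0;
    for (Map.Entry<String, Long> entry : offHeap.entrySet()) {
        totalSize += entry.getValue();
    }
    stats.addKnnVectors(fieldInfo.name, totalSize);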

Closes #127689
---
 muted-tests.yml                                    |  3 ---
 .../diskusage/IndexDiskUsageAnalyzer.java          | 60 ++-----------------
 2 files changed, 5 insertions(+), 58 deletions(-)

diff --git a/muted-tests.yml b/muted-tests.yml
index 758bfa607e9ef..e8579536675d2 100644
--- a/muted-tests.yml
+++ b/muted-tests.yml
@@ -399,9 +399,6 @@ tests:
 - class: org.elasticsearch.xpack.ccr.action.ShardFollowTaskReplicationTests
   method: testChangeFollowerHistoryUUID
   issue: https://github.com/elastic/elasticsearch/issues/127680
-- class: org.elasticsearch.action.admin.indices.diskusage.IndexDiskUsageAnalyzerTests
-  method: testKnnVectors
-  issue: https://github.com/elastic/elasticsearch/issues/127689
 - class: org.elasticsearch.backwards.MixedClusterClientYamlTestSuiteIT
   method: test {p0=search/350_point_in_time/point-in-time with index filter}
   issue: https://github.com/elastic/elasticsearch/issues/127741
diff --git a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
index 9d2595732c585..9ebd5f5664cc7 100644
--- a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
+++ b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
@@ -25,16 +25,13 @@
 import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat;
 import org.apache.lucene.index.BinaryDocValues;
-import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PointValues;
@@ -47,8 +44,6 @@
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.KnnCollector;
-import org.apache.lucene.search.TopKnnCollector;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FilterDirectory;
 import org.apache.lucene.store.IOContext;
@@ -562,57 +557,12 @@ void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws I
             cancellationChecker.checkForCancellation();
             directory.resetBytesRead();
             if (field.getVectorDimension() > 0) {
-                switch (field.getVectorEncoding()) {
-                    case BYTE -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getByteVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        ByteVectorValues vectorValues = vectorReader.getByteVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
-                    case FLOAT32 -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getFloatVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        FloatVectorValues vectorValues = vectorReader.getFloatVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
+                Map<String, Long> offHeap = vectorReader.getOffHeapByteSize(field);
+                long totalSize = 0;
+                for (var entry : offHeap.entrySet()) {
+                    totalSize += entry.getValue();
                 }
-
+                stats.addKnnVectors(field.name, totalSize);
             }
         }
     }

From 214a76a1dc7afe05bd562e52523887b41b862ed6 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Tue, 27 May 2025 13:23:58 -0400
Subject: [PATCH 2/3] Adjust tests for index disk usage
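
getOffHeapByteSize reports the logical size of the vector files, while
the expected sizes in this test are derived from what is written to
disk, which can be rounded up to the next file system block. The new
tolerance of 4096 bytes therefore allows for one block of slack; as a
worked example (the reported size is hypothetical, and a 4 KiB block
size is assumed):

    // rounding a logical size up to the next 4 KiB block
    long blockSize = 4096;
    long reported = 81_930;                                              // e.g. from getOffHeapByteSize
    long onDisk = ((reported + blockSize - 1) / blockSize) * blockSize;  // 86_016
    long slack = onDisk - reported;                                      // 4_086, always < blockSize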

---
 .../admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
index 58b847d7a87a1..0ee4e5394f744 100644
--- a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
+++ b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
@@ -762,8 +762,9 @@ private static void assertStats(IndexDiskUsageStats actualStats, IndexDiskUsageS
             0.01,
             2048
         );
-
-        assertFieldStats(field, "knn vectors", actualField.getKnnVectorsBytes(), expectedField.getKnnVectorsBytes(), 0.01, 1024);
+        // Allow a difference of one file block size for knn vectors:
+        // we get knn data usage from getOffHeapByteSize, but when written to disk it can be rounded up to the next block size
+        assertFieldStats(field, "knn vectors", actualField.getKnnVectorsBytes(), expectedField.getKnnVectorsBytes(), 0.01, 4096);
     }
     // We are not able to collect per field stats for stored, vector, points, and norms
     IndexDiskUsageStats.PerFieldDiskUsage actualTotal = actualStats.total();

From 8bbe9bf9850ad41965d4a81711d37f5b8b877103 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Fri, 6 Jun 2025 11:37:24 -0400
Subject: [PATCH 3/3] Handle .vec files opened with directIO; test byte vectors
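
When the .vec file is opened with directIO, getOffHeapByteSize reports
no off-heap size for the raw vector data, so the analyzer now
reconstructs that size from the number of vectors and the field
metadata. The test is also extended to cover byte vectors. With
numVectors vectors of the given dimension (names here are
illustrative), the raw vector data amounts to:

    // bytes of raw vector data; byteSize is 4 for FLOAT32 and 1 for BYTE
    long vectorsSize = (long) numVectors * dimension * field.getVectorEncoding().byteSize;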

---
 .../diskusage/IndexDiskUsageAnalyzer.java          | 12 ++++++++++++
 .../diskusage/IndexDiskUsageAnalyzerTests.java     | 24 +++++++++++++++----
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
index 9ebd5f5664cc7..deae3d9f2610a 100644
--- a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
+++ b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
@@ -562,6 +562,18 @@ void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws I
                 for (var entry : offHeap.entrySet()) {
                     totalSize += entry.getValue();
                 }
+                long vectorsSize = offHeap.getOrDefault("vec", 0L);
+                if (vectorsSize == 0L) {
+                    // This can happen if the .vec file is opened with directIO;
+                    // in that case, calculate the size of the raw vector data manually
+                    // as: number of vectors * dimension * bytes per dimension
+                    int numVectors = switch (field.getVectorEncoding()) {
+                        case BYTE -> vectorReader.getByteVectorValues(field.name).size();
+                        case FLOAT32 -> vectorReader.getFloatVectorValues(field.name).size();
+                    };
+                    vectorsSize = (long) numVectors * field.getVectorDimension() * field.getVectorEncoding().byteSize;
+                    totalSize += vectorsSize;
+                }
                 stats.addKnnVectors(field.name, totalSize);
             }
         }
diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
index 0ee4e5394f744..57de001ef90e5 100644
--- a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
+++ b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
@@ -24,6 +24,7 @@
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.KnnByteVectorField;
 import org.apache.lucene.document.KnnFloatVectorField;
 import org.apache.lucene.document.LatLonShape;
 import org.apache.lucene.document.LongPoint;
@@ -67,6 +68,7 @@
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.core.IOUtils;
 import org.elasticsearch.index.codec.postings.ES812PostingsFormat;
+import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.index.store.LuceneFilesExtensions;
 import org.elasticsearch.test.ESTestCase;
@@ -254,15 +256,27 @@ public void testKnnVectors() throws Exception {
         VectorSimilarityFunction similarity = randomFrom(VectorSimilarityFunction.values());
         int numDocs = between(1000, 5000);
         int dimension = between(10, 200);
+        DenseVectorFieldMapper.ElementType elementType = randomFrom(DenseVectorFieldMapper.ElementType.values());
 
-        indexRandomly(dir, codec, numDocs, doc -> {
-            float[] vector = randomVector(dimension);
-            doc.add(new KnnFloatVectorField("vector", vector, similarity));
-        });
+        if (elementType == DenseVectorFieldMapper.ElementType.FLOAT) {
+            indexRandomly(dir, codec, numDocs, doc -> {
+                float[] vector = randomVector(dimension);
+                doc.add(new KnnFloatVectorField("vector", vector, similarity));
+            });
+        } else {
+            indexRandomly(dir, codec, numDocs, doc -> {
+                byte[] vector = new byte[dimension];
+                random().nextBytes(vector);
+                doc.add(new KnnByteVectorField("vector", vector, similarity));
+            });
+        }
         final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(dir), () -> {});
         logger.info("--> stats {}", stats);
-        long dataBytes = (long) numDocs * dimension * Float.BYTES; // size of flat vector data
+        // expected size of flat vector data
+        long dataBytes = elementType == DenseVectorFieldMapper.ElementType.FLOAT
+            ? ((long) numDocs * dimension * Float.BYTES)
+            : ((long) numDocs * dimension);
         long indexBytesEstimate = (long) numDocs * (Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN / 4); // rough size of HNSW graph
         assertThat("numDocs=" + numDocs + ";dimension=" + dimension, stats.total().getKnnVectorsBytes(), greaterThan(dataBytes));
         long connectionOverhead = stats.total().getKnnVectorsBytes() - dataBytes;