diff --git a/muted-tests.yml b/muted-tests.yml
index 758bfa607e9ef..e8579536675d2 100644
--- a/muted-tests.yml
+++ b/muted-tests.yml
@@ -399,9 +399,6 @@ tests:
 - class: org.elasticsearch.xpack.ccr.action.ShardFollowTaskReplicationTests
   method: testChangeFollowerHistoryUUID
   issue: https://github.com/elastic/elasticsearch/issues/127680
-- class: org.elasticsearch.action.admin.indices.diskusage.IndexDiskUsageAnalyzerTests
-  method: testKnnVectors
-  issue: https://github.com/elastic/elasticsearch/issues/127689
 - class: org.elasticsearch.backwards.MixedClusterClientYamlTestSuiteIT
   method: test {p0=search/350_point_in_time/point-in-time with index filter}
   issue: https://github.com/elastic/elasticsearch/issues/127741
diff --git a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
index 9d2595732c585..deae3d9f2610a 100644
--- a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
+++ b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
@@ -25,16 +25,13 @@
 import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat;
 import org.apache.lucene.index.BinaryDocValues;
-import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PointValues;
@@ -47,8 +44,6 @@
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.KnnCollector;
-import org.apache.lucene.search.TopKnnCollector;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FilterDirectory;
 import org.apache.lucene.store.IOContext;
@@ -553,7 +548,7 @@ void visitField(Fields vectors, String fieldName) throws IOException {
         }
     }
-    void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
+    void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) {
         KnnVectorsReader vectorReader = reader.getVectorReader();
         if (vectorReader == null) {
             return;
         }
@@ -562,57 +557,19 @@ void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
             cancellationChecker.checkForCancellation();
             directory.resetBytesRead();
             if (field.getVectorDimension() > 0) {
-                switch (field.getVectorEncoding()) {
-                    case BYTE -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getByteVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        ByteVectorValues vectorValues = vectorReader.getByteVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
-                    case FLOAT32 -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getFloatVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        FloatVectorValues vectorValues = vectorReader.getFloatVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
-                }
-
+                Map<String, Long> offHeap = vectorReader.getOffHeapByteSize(field);
+                long totalSize = 0;
+                for (var entry : offHeap.entrySet()) {
+                    totalSize += entry.getValue();
+                }
+                long vectorsSize = offHeap.getOrDefault("vec", 0L);
+                if (vectorsSize == 0L) {
+                    // This can happen if the .vec file is opened with directIO;
+                    // calculate the size of the flat vector data manually.
+                    vectorsSize = (long) field.getVectorDimension() * field.getVectorEncoding().byteSize * reader.maxDoc();
+                    totalSize += vectorsSize;
+                }
+                stats.addKnnVectors(field.name, totalSize);
             }
         }
     }
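The replacement logic above reduces to summing the per-file byte counts reported by `KnnVectorsReader#getOffHeapByteSize`, with a computed flat-data size as a fallback when the raw-vector entry is absent. A minimal self-contained sketch of that aggregation, for reference while reviewing (the map keys and byte sizes are illustrative assumptions, not output from a real reader):

```java
import java.util.Map;

public class OffHeapSizeSketch {
    /**
     * Sums the reported off-heap sizes; if the raw vector data ("vec") is
     * unreported, falls back to dimension * bytesPerElement * vectorCount.
     */
    static long estimateKnnBytes(Map<String, Long> offHeap, int dimension, int bytesPerElement, int vectorCount) {
        long totalSize = 0;
        for (long size : offHeap.values()) {
            totalSize += size;
        }
        if (offHeap.getOrDefault("vec", 0L) == 0L) {
            // Mirrors the patch's fallback: the .vec file was opened with
            // directIO, so compute the flat vector data size manually.
            totalSize += (long) dimension * bytesPerElement * vectorCount;
        }
        return totalSize;
    }

    public static void main(String[] args) {
        // Illustrative sizes only: flat float32 vectors (.vec) plus an HNSW graph (.vex).
        System.out.println(estimateKnnBytes(Map.of("vec", 512_000L, "vex", 64_000L), 128, Float.BYTES, 1000)); // 576000
        // The directIO case: "vec" reported as 0, so the flat size is derived manually.
        System.out.println(estimateKnnBytes(Map.of("vec", 0L, "vex", 64_000L), 128, Float.BYTES, 1000)); // 576000
    }
}
```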
diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
index 58b847d7a87a1..57de001ef90e5 100644
--- a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
+++ b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
@@ -24,6 +24,7 @@
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.KnnByteVectorField;
 import org.apache.lucene.document.KnnFloatVectorField;
 import org.apache.lucene.document.LatLonShape;
 import org.apache.lucene.document.LongPoint;
@@ -67,6 +68,7 @@
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.core.IOUtils;
 import org.elasticsearch.index.codec.postings.ES812PostingsFormat;
+import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.index.store.LuceneFilesExtensions;
 import org.elasticsearch.test.ESTestCase;
@@ -254,15 +256,27 @@ public void testKnnVectors() throws Exception {
         VectorSimilarityFunction similarity = randomFrom(VectorSimilarityFunction.values());
         int numDocs = between(1000, 5000);
         int dimension = between(10, 200);
+        DenseVectorFieldMapper.ElementType elementType = randomFrom(DenseVectorFieldMapper.ElementType.values());
 
-        indexRandomly(dir, codec, numDocs, doc -> {
-            float[] vector = randomVector(dimension);
-            doc.add(new KnnFloatVectorField("vector", vector, similarity));
-        });
+        if (elementType == DenseVectorFieldMapper.ElementType.FLOAT) {
+            indexRandomly(dir, codec, numDocs, doc -> {
+                float[] vector = randomVector(dimension);
+                doc.add(new KnnFloatVectorField("vector", vector, similarity));
+            });
+        } else {
+            indexRandomly(dir, codec, numDocs, doc -> {
+                byte[] vector = new byte[dimension];
+                random().nextBytes(vector);
+                doc.add(new KnnByteVectorField("vector", vector, similarity));
+            });
+        }
 
         final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(dir), () -> {});
         logger.info("--> stats {}", stats);
-        long dataBytes = (long) numDocs * dimension * Float.BYTES; // size of flat vector data
+        // expected size of the flat vector data
+        long dataBytes = elementType == DenseVectorFieldMapper.ElementType.FLOAT
+            ? ((long) numDocs * dimension * Float.BYTES)
+            : ((long) numDocs * dimension);
         long indexBytesEstimate = (long) numDocs * (Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN / 4); // rough size of HNSW graph
         assertThat("numDocs=" + numDocs + ";dimension=" + dimension, stats.total().getKnnVectorsBytes(), greaterThan(dataBytes));
         long connectionOverhead = stats.total().getKnnVectorsBytes() - dataBytes;
@@ -762,8 +776,9 @@ private static void assertStats(IndexDiskUsageStats actualStats, IndexDiskUsageStats expectedStats) {
             0.01,
             2048
         );
-
-        assertFieldStats(field, "knn vectors", actualField.getKnnVectorsBytes(), expectedField.getKnnVectorsBytes(), 0.01, 1024);
+        // Allow a difference of one file block size for knn vectors: we get the knn data usage
+        // from getOffHeapByteSize, but when written to disk it can be rounded up to the next block size.
+        assertFieldStats(field, "knn vectors", actualField.getKnnVectorsBytes(), expectedField.getKnnVectorsBytes(), 0.01, 4096);
     }
     // We are not able to collect per field stats for stored, vector, points, and norms
     IndexDiskUsageStats.PerFieldDiskUsage actualTotal = actualStats.total();
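To make the test's assertion arithmetic concrete, here is a small standalone sketch of the quantities it compares. The `numDocs` and `dimension` values are made-up samples within the test's random ranges, and the value 16 assumes the Lucene99 HNSW default `DEFAULT_MAX_CONN`:

```java
public class KnnVectorsTestMath {
    public static void main(String[] args) {
        // Assumed sample values inside the test's random ranges.
        int numDocs = 2000;
        int dimension = 100;

        // Flat vector data: float32 vectors occupy dimension * 4 bytes per doc,
        // byte vectors occupy dimension bytes per doc.
        long floatDataBytes = (long) numDocs * dimension * Float.BYTES; // 800000
        long byteDataBytes = (long) numDocs * dimension; // 200000

        // The test's rough HNSW graph estimate: numDocs * (DEFAULT_MAX_CONN / 4),
        // i.e. numDocs * 4 bytes with the default max_conn of 16.
        long indexBytesEstimate = (long) numDocs * (16 / 4); // 8000

        System.out.printf("float=%d byte=%d hnswEstimate=%d%n", floatDataBytes, byteDataBytes, indexBytesEstimate);
    }
}
```

The reported kNN bytes must exceed the flat data size, and the remainder (the graph plus metadata overhead) is what the test bounds with the `indexBytesEstimate` figure.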