diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java
index 1637cd33ac0af..a3b2fd3633adf 100644
--- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java
+++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java
@@ -26,7 +26,7 @@
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.cluster.metadata.DataStream;
 import org.elasticsearch.common.logging.LogConfigurator;
-import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec;
+import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec;
 import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
@@ -63,7 +63,6 @@ public class TSDBDocValuesMergeBenchmark {

     static {
-        // For Elasticsearch900Lucene101Codec:
         LogConfigurator.loadLog4jPlugins();
         LogConfigurator.configureESLogging();
         LogConfigurator.setNodeName("test");
@@ -259,8 +258,7 @@ private static IndexWriterConfig createIndexWriterConfig(boolean optimizedMergeEnabled)
         config.setLeafSorter(DataStream.TIMESERIES_LEAF_READERS_SORTER);
         config.setMergePolicy(new LogByteSizeMergePolicy());
         var docValuesFormat = new ES819TSDBDocValuesFormat(4096, 512, optimizedMergeEnabled);
-        config.setCodec(new Elasticsearch900Lucene101Codec() {
-
+        config.setCodec(new Elasticsearch92Lucene103Codec() {
             @Override
             public DocValuesFormat getDocValuesFormatForField(String field) {
                 return docValuesFormat;
diff --git a/build-tools-internal/version.properties b/build-tools-internal/version.properties
index 8cd550b0b9ffe..82d5645af232e 100644
--- a/build-tools-internal/version.properties
+++ b/build-tools-internal/version.properties
@@ -1,5 +1,5 @@
 elasticsearch = 9.2.0
-lucene = 10.2.2
+lucene = 10.3.0

 bundled_jdk_vendor = openjdk
 bundled_jdk = 25+36@bd75d5f9689641da8e1daabeccb5528b
diff --git a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/SystemJvmOptions.java b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/SystemJvmOptions.java
index 3c0d3072b0e57..d2a7bbb7345d1 100644
--- a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/SystemJvmOptions.java
+++ b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/SystemJvmOptions.java
@@ -60,13 +60,6 @@ static List<String> systemJvmOptions(Settings nodeSettings, final Map<String, String>
[the body of this hunk and the following XML-only hunks (likely the gradle dependency-verification checksum updates for Lucene 10.3.0) were lost in extraction; only their -/+ line markers survived]
diff --git a/plugins/store-smb/src/main/java/org/elasticsearch/index/store/smb/SmbMmapFsDirectoryFactory.java b/plugins/store-smb/src/main/java/org/elasticsearch/index/store/smb/SmbMmapFsDirectoryFactory.java
index c4b0519309311..5841804d438f8 100644
--- a/plugins/store-smb/src/main/java/org/elasticsearch/index/store/smb/SmbMmapFsDirectoryFactory.java
+++ b/plugins/store-smb/src/main/java/org/elasticsearch/index/store/smb/SmbMmapFsDirectoryFactory.java
@@ -26,7 +26,7 @@ public final class SmbMmapFsDirectoryFactory extends FsDirectoryFactory {
     protected Directory newFSDirectory(Path location, LockFactory lockFactory, IndexSettings
indexSettings) throws IOException { MMapDirectory mMapDirectory = adjustSharedArenaGrouping(new MMapDirectory(location, lockFactory)); return new SmbDirectoryWrapper( - setPreload(mMapDirectory, new HashSet<>(indexSettings.getValue(IndexModule.INDEX_STORE_PRE_LOAD_SETTING))) + setMMapFunctions(mMapDirectory, new HashSet<>(indexSettings.getValue(IndexModule.INDEX_STORE_PRE_LOAD_SETTING))) ); } } diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java index 9de3b7fa15b6f..421375f038475 100644 --- a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java +++ b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java @@ -13,7 +13,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.KnnVectorsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -130,7 +130,7 @@ static Codec createCodec(CmdLineArgs args) { format = new Lucene99HnswVectorsFormat(args.hnswM(), args.hnswEfConstruction(), 1, null); } } - return new Lucene101Codec() { + return new Lucene103Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return format; diff --git a/server/src/internalClusterTest/java/org/elasticsearch/index/store/DirectIOIT.java b/server/src/internalClusterTest/java/org/elasticsearch/index/store/DirectIOIT.java index 13be29a720611..7e23161cd42cf 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/index/store/DirectIOIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/index/store/DirectIOIT.java @@ -46,6 +46,8 @@ @ESTestCase.WithoutEntitlements // requires entitlement delegation ES-10920 public class DirectIOIT extends ESIntegTestCase { + private static boolean SUPPORTED; + @BeforeClass public static void checkSupported() { assumeTrue("Direct IO is not enabled", ES818BinaryQuantizedVectorsFormat.USE_DIRECT_IO); @@ -53,8 +55,9 @@ public static void checkSupported() { Path path = createTempDir("directIOProbe"); try (Directory dir = open(path); IndexOutput out = dir.createOutput("out", IOContext.DEFAULT)) { out.writeString("test"); + SUPPORTED = true; } catch (IOException e) { - assumeNoException("test requires filesystem that supports Direct IO", e); + SUPPORTED = false; } } @@ -112,15 +115,21 @@ static void assertBBQIndexType(String type) { @TestLogging(value = "org.elasticsearch.index.store.FsDirectoryFactory:DEBUG", reason = "to capture trace logging for direct IO") public void testDirectIOUsed() { try (MockLog mockLog = MockLog.capture(FsDirectoryFactory.class)) { - // we're just looking for some evidence direct IO is used - mockLog.addExpectation( - new MockLog.PatternSeenEventExpectation( + // we're just looking for some evidence direct IO is used (or not) + MockLog.LoggingExpectation expectation = SUPPORTED + ? 
new MockLog.PatternSeenEventExpectation( "Direct IO used", FsDirectoryFactory.class.getCanonicalName(), Level.DEBUG, "Opening .*\\.vec with direct IO" ) - ); + : new MockLog.PatternSeenEventExpectation( + "Direct IO not used", + FsDirectoryFactory.class.getCanonicalName(), + Level.DEBUG, + "Could not open .*\\.vec with direct IO" + ); + mockLog.addExpectation(expectation); indexVectors(); diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/simple/SimpleSearchIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/simple/SimpleSearchIT.java index 5a9be73d92268..dc18835460d20 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/search/simple/SimpleSearchIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/search/simple/SimpleSearchIT.java @@ -261,10 +261,12 @@ public void testSimpleTerminateAfterCount() throws Exception { ensureGreen(); refresh(); - for (int i = 1; i < max; i++) { + // query all but one doc to avoid optimizations that may rewrite to a MatchAllDocs, which simplifies assertions + final int queryMax = max - 1; + for (int i = 1; i < queryMax; i++) { final int finalI = i; assertResponse( - prepareSearch("test").setQuery(QueryBuilders.rangeQuery("field").gte(1).lte(max)).setTerminateAfter(i), + prepareSearch("test").setQuery(QueryBuilders.rangeQuery("field").gte(1).lte(queryMax)).setTerminateAfter(i), response -> { assertHitCount(response, finalI); assertTrue(response.isTerminatedEarly()); @@ -272,9 +274,9 @@ public void testSimpleTerminateAfterCount() throws Exception { ); } assertResponse( - prepareSearch("test").setQuery(QueryBuilders.rangeQuery("field").gte(1).lte(max)).setTerminateAfter(2 * max), + prepareSearch("test").setQuery(QueryBuilders.rangeQuery("field").gte(1).lte(queryMax)).setTerminateAfter(2 * max), response -> { - assertHitCount(response, max); + assertHitCount(response, queryMax); assertFalse(response.isTerminatedEarly()); } ); diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index 470ca69fb0d68..cfe6345d7e590 100644 --- a/server/src/main/java/module-info.java +++ b/server/src/main/java/module-info.java @@ -469,7 +469,8 @@ org.elasticsearch.index.codec.Elasticsearch814Codec, org.elasticsearch.index.codec.Elasticsearch816Codec, org.elasticsearch.index.codec.Elasticsearch900Codec, - org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; + org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec, + org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; provides org.apache.logging.log4j.core.util.ContextDataProvider with org.elasticsearch.common.logging.DynamicContextDataProvider; diff --git a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java index 0e28364316197..998356cb22abf 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java +++ b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java @@ -10,6 +10,7 @@ package org.elasticsearch.action.admin.indices.diskusage; import org.apache.logging.log4j.Logger; +import org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat; import org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat; import org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat; import org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat; @@ 
-22,18 +23,15 @@ import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; -import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PointValues; @@ -46,8 +44,6 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.search.TopKnnCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FilterDirectory; import org.apache.lucene.store.IOContext; @@ -319,6 +315,9 @@ private static void readProximity(Terms terms, PostingsEnum postings) throws IOE private static BlockTermState getBlockTermState(TermsEnum termsEnum, BytesRef term) throws IOException { if (term != null && termsEnum.seekExact(term)) { final TermState termState = termsEnum.termState(); + if (termState instanceof final Lucene103PostingsFormat.IntBlockTermState blockTermState) { + return new BlockTermState(blockTermState.docStartFP, blockTermState.posStartFP, blockTermState.payStartFP); + } if (termState instanceof final Lucene101PostingsFormat.IntBlockTermState blockTermState) { return new BlockTermState(blockTermState.docStartFP, blockTermState.posStartFP, blockTermState.payStartFP); } @@ -550,7 +549,7 @@ void visitField(Fields vectors, String fieldName) throws IOException { } } - void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException { + void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) { KnnVectorsReader vectorReader = reader.getVectorReader(); if (vectorReader == null) { return; @@ -559,57 +558,19 @@ void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws I cancellationChecker.checkForCancellation(); directory.resetBytesRead(); if (field.getVectorDimension() > 0) { - switch (field.getVectorEncoding()) { - case BYTE -> { - iterateDocValues(reader.maxDoc(), () -> vectorReader.getByteVectorValues(field.name).iterator(), vectors -> { - cancellationChecker.logEvent(); - vectors.index(); - }); - - // do a couple of randomized searches to figure out min and max offsets of index file - ByteVectorValues vectorValues = vectorReader.getByteVectorValues(field.name); - KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); - final KnnCollector collector = new TopKnnCollector( - Math.max(1, Math.min(100, vectorValues.size() - 1)), - Integer.MAX_VALUE - ); - int numDocsToVisit = reader.maxDoc() < 10 ? 
reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
-                    case FLOAT32 -> {
-                        iterateDocValues(reader.maxDoc(), () -> vectorReader.getFloatVectorValues(field.name).iterator(), vectors -> {
-                            cancellationChecker.logEvent();
-                            vectors.index();
-                        });
-
-                        // do a couple of randomized searches to figure out min and max offsets of index file
-                        FloatVectorValues vectorValues = vectorReader.getFloatVectorValues(field.name);
-                        KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();
-                        final KnnCollector collector = new TopKnnCollector(
-                            Math.max(1, Math.min(100, vectorValues.size() - 1)),
-                            Integer.MAX_VALUE
-                        );
-                        int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
-                        int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
-                        for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
-                            if ((i = iterator.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
-                                break;
-                            }
-                            cancellationChecker.checkForCancellation();
-                            vectorReader.search(field.name, vectorValues.vectorValue(iterator.index()), collector, null);
-                        }
-                        stats.addKnnVectors(field.name, directory.getBytesRead());
-                    }
+                Map<String, Long> offHeap = vectorReader.getOffHeapByteSize(field);
+                long totalSize = 0;
+                for (var entry : offHeap.entrySet()) {
+                    totalSize += entry.getValue();
                 }
-
+                long vectorsSize = offHeap.getOrDefault("vec", 0L);
+                if (vectorsSize == 0L) {
+                    // This can happen if the .vec file is opened with direct IO;
+                    // calculate the size of the vectors manually.
+                    vectorsSize = field.getVectorDimension() * field.getVectorEncoding().byteSize;
+                    totalSize += vectorsSize;
+                }
+                stats.addKnnVectors(field.name, totalSize);
             }
         }
     }
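Rather than issuing randomized searches and measuring bytes read through the directory, the analyzer now asks the vector reader directly for its off-heap footprint. Lucene 10.3's KnnVectorsReader#getOffHeapByteSize — the API the new code above calls — reports sizes keyed by file extension (e.g. "vec" for raw vectors, "vex" for the HNSW graph). A minimal sketch of consuming it; the helper class and method names are illustrative, not part of the PR:

```java
import java.util.Map;

import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.index.FieldInfo;

class OffHeapSizeExample {
    /** Sums the off-heap bytes a vector reader reports for one field, as the analyzer above does. */
    static long totalOffHeapBytes(KnnVectorsReader vectorReader, FieldInfo field) {
        long totalSize = 0;
        for (Map.Entry<String, Long> entry : vectorReader.getOffHeapByteSize(field).entrySet()) {
            totalSize += entry.getValue();
        }
        return totalSize;
    }
}
```

Note the fallback in the diff: when the "vec" entry is absent (the file was opened with direct IO), the code estimates a vector's size from dimension × encoding byte size rather than reporting zero.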
diff --git a/server/src/main/java/org/elasticsearch/bootstrap/Elasticsearch.java b/server/src/main/java/org/elasticsearch/bootstrap/Elasticsearch.java
index 8b59a59dcf313..c7b08add7d50b 100644
--- a/server/src/main/java/org/elasticsearch/bootstrap/Elasticsearch.java
+++ b/server/src/main/java/org/elasticsearch/bootstrap/Elasticsearch.java
@@ -40,7 +40,6 @@
 import org.elasticsearch.entitlement.runtime.policy.entitlements.LoadNativeLibrariesEntitlement;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexVersion;
-import org.elasticsearch.index.codec.vectors.reflect.OffHeapReflectionUtils;
 import org.elasticsearch.jdk.JarHell;
 import org.elasticsearch.monitor.jvm.HotThreads;
 import org.elasticsearch.monitor.jvm.JvmInfo;
@@ -216,9 +215,7 @@ private static void initPhase2(Bootstrap bootstrap) throws IOException {
             // RequestHandlerRegistry and MethodHandlers classes do nontrivial static initialization which should always succeed but load
             // it now (before SM) to be sure
             RequestHandlerRegistry.class,
-            MethodHandlers.class,
-            // Ensure member access and reflection lookup are as expected
-            OffHeapReflectionUtils.class
+            MethodHandlers.class
         );

         // load the plugin Java modules and layers now for use in entitlements
@@ -374,7 +371,7 @@ private static void reflectiveStartProcess(ProcessBuilder pb) throws Exception {

     private static void ensureInitialized(Class<?>... classes) {
         for (final var clazz : classes) {
             try {
-                MethodHandles.lookup().ensureInitialized(clazz);
+                MethodHandles.publicLookup().ensureInitialized(clazz);
             } catch (IllegalAccessException unexpected) {
                 throw new AssertionError(unexpected);
             }
diff --git a/server/src/main/java/org/elasticsearch/common/lucene/Lucene.java b/server/src/main/java/org/elasticsearch/common/lucene/Lucene.java
index d4162a3996032..b963ff4f649b6 100644
--- a/server/src/main/java/org/elasticsearch/common/lucene/Lucene.java
+++ b/server/src/main/java/org/elasticsearch/common/lucene/Lucene.java
@@ -93,7 +93,7 @@
 public class Lucene {

-    public static final String LATEST_CODEC = "Lucene101";
+    public static final String LATEST_CODEC = "Lucene103";

     public static final String SOFT_DELETES_FIELD = "__soft_deletes";
diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersions.java b/server/src/main/java/org/elasticsearch/index/IndexVersions.java
index f3985c2dd5f8f..bad3dfbd38848 100644
--- a/server/src/main/java/org/elasticsearch/index/IndexVersions.java
+++ b/server/src/main/java/org/elasticsearch/index/IndexVersions.java
@@ -184,6 +184,7 @@ private static Version parseUnchecked(String version) {
     public static final IndexVersion EXCLUDE_SOURCE_VECTORS_DEFAULT = def(9_035_0_00, Version.LUCENE_10_2_2);
     public static final IndexVersion DISABLE_NORMS_BY_DEFAULT_FOR_LOGSDB_AND_TSDB = def(9_036_0_00, Version.LUCENE_10_2_2);
     public static final IndexVersion TSID_CREATED_DURING_ROUTING = def(9_037_0_00, Version.LUCENE_10_2_2);
+    public static final IndexVersion UPGRADE_TO_LUCENE_10_3_0 = def(9_038_0_00, Version.LUCENE_10_3_0);

     /*
      * STOP! READ THIS FIRST! No, really,
diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/OffHeapStats.java b/server/src/main/java/org/elasticsearch/index/StandardIOBehaviorHint.java
similarity index 56%
rename from server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/OffHeapStats.java
rename to server/src/main/java/org/elasticsearch/index/StandardIOBehaviorHint.java
index 79eb118f389cc..ed32d6def7a32 100644
--- a/server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/OffHeapStats.java
+++ b/server/src/main/java/org/elasticsearch/index/StandardIOBehaviorHint.java
@@ -7,17 +7,13 @@
  * License v3.0 only", or the "Server Side Public License, v 1".
  */

-package org.elasticsearch.index.codec.vectors.reflect;
+package org.elasticsearch.index;

-import org.apache.lucene.index.FieldInfo;
-
-import java.util.Map;
+import org.apache.lucene.store.IOContext;

 /**
- * Common interface to unify offHeapByteSize in ES' KnnVectorsReader implementations.
- * Remove once KnnVectorsReaders::getOffHeapByteSize is available.
+ * A hint that no special behavior should be set on open files.
  */
-public interface OffHeapStats {
-
-    Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo);
+public enum StandardIOBehaviorHint implements IOContext.FileOpenHint {
+    INSTANCE
 }
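The rename above also shows why the reflection shim could go: Lucene 10.3 provides getOffHeapByteSize natively, and the new StandardIOBehaviorHint plugs into Lucene 10.3's IOContext.FileOpenHint mechanism instead. A sketch of how such a hint might be attached and queried — this assumes Lucene 10.3's IOContext hint API (hints()/withHints); the helper class is hypothetical and not part of the PR:

```java
import org.apache.lucene.store.IOContext;
import org.elasticsearch.index.StandardIOBehaviorHint;

class HintExample {
    // Returns a copy of the context carrying the "no special behavior" marker
    // (withHints is assumed from the Lucene 10.3 IOContext API).
    static IOContext standard(IOContext context) {
        return context.withHints(StandardIOBehaviorHint.INSTANCE);
    }

    // A directory implementation can then check the hint before applying
    // preload/direct-IO style optimizations (hints() assumed likewise).
    static boolean useStandardIO(IOContext context) {
        return context.hints().contains(StandardIOBehaviorHint.INSTANCE);
    }
}
```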
diff --git a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java
index 5f887b2b594d3..17028137b78d8 100644
--- a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java
+++ b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java
@@ -12,7 +12,7 @@
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.FieldInfosFormat;
 import org.apache.lucene.codecs.FilterCodec;
-import org.apache.lucene.codecs.lucene101.Lucene101Codec;
+import org.apache.lucene.codecs.lucene103.Lucene103Codec;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.common.util.FeatureFlag;
 import org.elasticsearch.core.Nullable;
@@ -46,7 +46,7 @@ public class CodecService implements CodecProvider {

     public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays) {
         final var codecs = new HashMap<String, Codec>();
-        Codec legacyBestSpeedCodec = new LegacyPerFieldMapperCodec(Lucene101Codec.Mode.BEST_SPEED, mapperService, bigArrays);
+        Codec legacyBestSpeedCodec = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_SPEED, mapperService, bigArrays);
         if (ZSTD_STORED_FIELDS_FEATURE_FLAG) {
             codecs.put(DEFAULT_CODEC, new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_SPEED, mapperService, bigArrays));
         } else {
@@ -58,7 +58,7 @@ public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays)
             BEST_COMPRESSION_CODEC,
             new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_COMPRESSION, mapperService, bigArrays)
         );
-        Codec legacyBestCompressionCodec = new LegacyPerFieldMapperCodec(Lucene101Codec.Mode.BEST_COMPRESSION, mapperService, bigArrays);
+        Codec legacyBestCompressionCodec = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_COMPRESSION, mapperService, bigArrays);
         codecs.put(LEGACY_BEST_COMPRESSION_CODEC, legacyBestCompressionCodec);

         codecs.put(LUCENE_DEFAULT_CODEC, Codec.getDefault());
diff --git a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java
index 3edd55d8f8de7..ad2c40950b6c9 100644
--- a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java
+++ b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java
@@ -9,12 +9,12 @@

 package org.elasticsearch.index.codec;

+import org.apache.lucene.backward_codecs.lucene101.Lucene101Codec;
+import org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.KnnVectorsFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.StoredFieldsFormat;
-import org.apache.lucene.codecs.lucene101.Lucene101Codec;
-import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
 import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
 import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
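The new codec below is wired up through SPI (see the provides clause added to module-info.java earlier), which is why it keeps a public no-arg constructor. At read time Lucene resolves it by name; a quick illustrative snippet:

```java
import org.apache.lucene.codecs.Codec;

class CodecLookupExample {
    public static void main(String[] args) {
        // Resolved through the SPI registration; the name matches the
        // super("Elasticsearch92Lucene103", ...) call in the new codec.
        Codec codec = Codec.forName("Elasticsearch92Lucene103");
        System.out.println(codec.getName()); // prints Elasticsearch92Lucene103
    }
}
```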
diff --git a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch92Lucene103Codec.java b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch92Lucene103Codec.java
new file mode 100644
index 0000000000000..c26d485fc8c99
--- /dev/null
+++ b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch92Lucene103Codec.java
@@ -0,0 +1,133 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.index.codec;
+
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene103.Lucene103Codec;
+import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
+import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
+import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
+import org.elasticsearch.index.codec.perfield.XPerFieldDocValuesFormat;
+import org.elasticsearch.index.codec.zstd.Zstd814StoredFieldsFormat;
+
+/**
+ * Elasticsearch codec as of 9.2, relying on Lucene 10.3. This extends the Lucene 10.3 codec to compress
+ * stored fields with ZSTD instead of LZ4/DEFLATE. See {@link Zstd814StoredFieldsFormat}.
+ */
+public class Elasticsearch92Lucene103Codec extends CodecService.DeduplicateFieldInfosCodec {
+
+    static final PostingsFormat DEFAULT_POSTINGS_FORMAT = new Lucene103PostingsFormat();
+
+    private final StoredFieldsFormat storedFieldsFormat;
+
+    private final PostingsFormat defaultPostingsFormat;
+    private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
+        @Override
+        public PostingsFormat getPostingsFormatForField(String field) {
+            return Elasticsearch92Lucene103Codec.this.getPostingsFormatForField(field);
+        }
+    };
+
+    private final DocValuesFormat defaultDVFormat;
+    private final DocValuesFormat docValuesFormat = new XPerFieldDocValuesFormat() {
+        @Override
+        public DocValuesFormat getDocValuesFormatForField(String field) {
+            return Elasticsearch92Lucene103Codec.this.getDocValuesFormatForField(field);
+        }
+    };
+
+    private final KnnVectorsFormat defaultKnnVectorsFormat;
+    private final KnnVectorsFormat knnVectorsFormat = new PerFieldKnnVectorsFormat() {
+        @Override
+        public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+            return Elasticsearch92Lucene103Codec.this.getKnnVectorsFormatForField(field);
+        }
+    };
+
+    /** Public no-arg constructor, needed for SPI loading at read-time. */
+    public Elasticsearch92Lucene103Codec() {
+        this(Zstd814StoredFieldsFormat.Mode.BEST_SPEED);
+    }
+
+    /**
+     * Constructor. Takes a {@link Zstd814StoredFieldsFormat.Mode} that describes whether to optimize for retrieval speed at the expense of
+     * worse space-efficiency or vice-versa.
+     */
+    public Elasticsearch92Lucene103Codec(Zstd814StoredFieldsFormat.Mode mode) {
+        super("Elasticsearch92Lucene103", new Lucene103Codec());
+        this.storedFieldsFormat = mode.getFormat();
+        this.defaultPostingsFormat = DEFAULT_POSTINGS_FORMAT;
+        this.defaultDVFormat = new Lucene90DocValuesFormat();
+        this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
+    }
+
+    @Override
+    public StoredFieldsFormat storedFieldsFormat() {
+        return storedFieldsFormat;
+    }
+
+    @Override
+    public final PostingsFormat postingsFormat() {
+        return postingsFormat;
+    }
+
+    @Override
+    public final DocValuesFormat docValuesFormat() {
+        return docValuesFormat;
+    }
+
+    @Override
+    public final KnnVectorsFormat knnVectorsFormat() {
+        return knnVectorsFormat;
+    }
+
+    /**
+     * Returns the postings format that should be used for writing new segments of field.
+     *
+     * The default implementation always returns the Lucene103 postings format.
+     *
+     * WARNING: if you subclass, you are responsible for index backwards compatibility:
+     * future versions of Lucene are only guaranteed to be able to read the default implementation.
+     */
+    public PostingsFormat getPostingsFormatForField(String field) {
+        return defaultPostingsFormat;
+    }
+
+    /**
+     * Returns the doc values format that should be used for writing new segments of field.
+     *
+     * The default implementation always returns the Lucene90 doc values format.
+     *
+     * WARNING: if you subclass, you are responsible for index backwards compatibility:
+     * future versions of Lucene are only guaranteed to be able to read the default implementation.
+     */
+    public DocValuesFormat getDocValuesFormatForField(String field) {
+        return defaultDVFormat;
+    }
+
+    /**
+     * Returns the vectors format that should be used for writing new segments of field.
+     *
+     * The default implementation always returns the Lucene99 HNSW vectors format.
+     *
+     * WARNING: if you subclass, you are responsible for index backwards compatibility:
+     * future versions of Lucene are only guaranteed to be able to read the default implementation.
+     */
+    public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+        return defaultKnnVectorsFormat;
+    }
+
+}
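The three getXXXFormatForField hooks are the codec's extension point: PerFieldMapperCodec routes per-field decisions through them, and the TSDB benchmark at the top of this diff overrides the doc-values hook the same way. A minimal sketch (the class name and format parameters are illustrative, mirroring the benchmark):

```java
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.index.IndexWriterConfig;
import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec;
import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat;

class PerFieldCodecExample {
    static IndexWriterConfig tsdbConfig() {
        var docValuesFormat = new ES819TSDBDocValuesFormat(4096, 512, true);
        var config = new IndexWriterConfig();
        config.setCodec(new Elasticsearch92Lucene103Codec() {
            @Override
            public DocValuesFormat getDocValuesFormatForField(String field) {
                // Route every field to the TSDB doc-values format; postings,
                // vectors and stored fields keep the 9.2 defaults.
                return docValuesFormat;
            }
        });
        return config;
    }
}
```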
diff --git a/server/src/main/java/org/elasticsearch/index/codec/LegacyPerFieldMapperCodec.java b/server/src/main/java/org/elasticsearch/index/codec/LegacyPerFieldMapperCodec.java
index 9e4ecb1a46c17..c3e9ab6617b87 100644
--- a/server/src/main/java/org/elasticsearch/index/codec/LegacyPerFieldMapperCodec.java
+++ b/server/src/main/java/org/elasticsearch/index/codec/LegacyPerFieldMapperCodec.java
@@ -13,7 +13,7 @@
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.KnnVectorsFormat;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene101.Lucene101Codec;
+import org.apache.lucene.codecs.lucene103.Lucene103Codec;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.index.mapper.MapperService;
@@ -22,11 +22,11 @@
  * Legacy version of {@link PerFieldMapperCodec}. This codec is preserved to give an escape hatch in case we encounter issues with new
  * changes in {@link PerFieldMapperCodec}.
  */
-public final class LegacyPerFieldMapperCodec extends Lucene101Codec {
+public final class LegacyPerFieldMapperCodec extends Lucene103Codec {

     private final PerFieldFormatSupplier formatSupplier;

-    public LegacyPerFieldMapperCodec(Lucene101Codec.Mode compressionMode, MapperService mapperService, BigArrays bigArrays) {
+    public LegacyPerFieldMapperCodec(Lucene103Codec.Mode compressionMode, MapperService mapperService, BigArrays bigArrays) {
         super(compressionMode);
         this.formatSupplier = new PerFieldFormatSupplier(mapperService, bigArrays);
         // If the below assertion fails, it is a sign that Lucene released a new codec.
You must create a copy of the current Elasticsearch diff --git a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java index 11362c6e68cd7..e3d3644f791cf 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java @@ -75,9 +75,9 @@ public PerFieldFormatSupplier(MapperService mapperService, BigArrays bigArrays) this.bloomFilterPostingsFormat = new ES87BloomFilterPostingsFormat(bigArrays, this::internalGetPostingsFormatForField); if (mapperService != null - && mapperService.getIndexSettings().getIndexVersionCreated().onOrAfter(IndexVersions.USE_LUCENE101_POSTINGS_FORMAT) + && mapperService.getIndexSettings().getIndexVersionCreated().onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_3_0) && mapperService.getIndexSettings().getMode().useDefaultPostingsFormat()) { - defaultPostingsFormat = Elasticsearch900Lucene101Codec.DEFAULT_POSTINGS_FORMAT; + defaultPostingsFormat = Elasticsearch92Lucene103Codec.DEFAULT_POSTINGS_FORMAT; } else { // our own posting format using PFOR defaultPostingsFormat = es812PostingsFormat; diff --git a/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodec.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodec.java index 9a3055f96bba8..0ffb63270fd58 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodec.java +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodec.java @@ -26,7 +26,7 @@ * per index in real time via the mapping API. If no specific postings format or vector format is * configured for a specific field the default postings or vector format is used. */ -public final class PerFieldMapperCodec extends Elasticsearch900Lucene101Codec { +public final class PerFieldMapperCodec extends Elasticsearch92Lucene103Codec { private final PerFieldFormatSupplier formatSupplier; diff --git a/server/src/main/java/org/elasticsearch/index/codec/TrackingPostingsInMemoryBytesCodec.java b/server/src/main/java/org/elasticsearch/index/codec/TrackingPostingsInMemoryBytesCodec.java index 8365930bdc510..6afe6815caf26 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/TrackingPostingsInMemoryBytesCodec.java +++ b/server/src/main/java/org/elasticsearch/index/codec/TrackingPostingsInMemoryBytesCodec.java @@ -30,7 +30,7 @@ /** * A codec that tracks the length of the min and max written terms. Used to improve memory usage estimates in serverless, since - * {@link org.apache.lucene.codecs.lucene90.blocktree.FieldReader} keeps an in-memory reference to the min and max term. + * {@link org.apache.lucene.codecs.lucene103.blocktree.FieldReader} keeps an in-memory reference to the min and max term. 
*/ public class TrackingPostingsInMemoryBytesCodec extends FilterCodec { public static final String IN_MEMORY_POSTINGS_BYTES_KEY = "es.postings.in_memory_bytes"; diff --git a/server/src/main/java/org/elasticsearch/index/codec/postings/ES812PostingsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/postings/ES812PostingsFormat.java index 6ccfaba7853f2..4fb6bcd00b1fb 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/postings/ES812PostingsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/postings/ES812PostingsFormat.java @@ -19,6 +19,7 @@ */ package org.elasticsearch.index.codec.postings; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; @@ -27,8 +28,6 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -100,7 +99,7 @@ *

Term Dictionary *

The .tim file contains the list of terms in each field along with per-term statistics * (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the - * .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on + * .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsReader} for more details on * the format. *

NOTE: The term dictionary can plug into different postings implementations: the postings * writer/reader are actually responsible for encoding and decoding the PostingsHeader and @@ -155,7 +154,7 @@ *

*
Term Index *

The .tip file contains an index into the term dictionary, so that it can be accessed - * randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format. + * randomly. See {@link Lucene90BlockTreeTermsReader} for more details on the format. *

 *
 *
@@ -343,7 +342,7 @@
 *
 *
 */
-public final class ES812PostingsFormat extends PostingsFormat {
+public class ES812PostingsFormat extends PostingsFormat {

    /**
     * Filename extension for document number, frequencies, and skip data. See chapter: <a
     * href="#Frequencies">Frequencies and Skip Data</a>
[the remainder of the ES812PostingsFormat.java diff and the header of the new file (inferred: server/src/main/java/org/elasticsearch/index/codec/postings/Lucene90BlockTreeTermsWriter.java) were lost in extraction]
+/**
+ * Writes terms dict and index, block-encoding (column stride) each term's metadata for each set
+ * of terms between two index terms.
+ *
+ *

Files:
 + *
 + *   • .tim: Term Dictionary
 + *   • .tip: Term Index
 + *   • .tmd: Term Metadata
 + *
 + *
Term Dictionary

+ * + *

The .tim file contains the list of terms in each field along with per-term statistics (such as + * docfreq) and per-term metadata (typically pointers to the postings list for that term in the + * inverted index). + * + *

The .tim is arranged in blocks: with blocks containing a variable number of entries (by + * default 25-48), where each entry is either a term or a reference to a sub-block. + * + *

NOTE: The term dictionary can plug into different postings implementations: the postings + * writer/reader are actually responsible for encoding and decoding the Postings Metadata and Term + * Metadata sections. + * + *

    + *
  • TermsDict (.tim) --> Header, FieldDict^NumFields, Footer
 + *
  • FieldDict --> PostingsHeader, NodeBlock^NumBlocks
 + *
  • NodeBlock --> (OuterNode | InnerNode) + *
  • OuterNode --> EntryCount, SuffixLength, Byte^SuffixLength, StatsLength,
 + * <TermStats>^EntryCount, MetaLength, <TermMetadata>^EntryCount
 + *
  • InnerNode --> EntryCount, SuffixLength[,Sub?], Byte^SuffixLength, StatsLength,
 + * <TermStats ?>^EntryCount, MetaLength, <TermMetadata ?>^EntryCount
 + *
  • TermStats --> DocFreq, TotalTermFreq + *
  • Header --> {@link CodecUtil#writeHeader CodecHeader} + *
  • EntryCount,SuffixLength,StatsLength,DocFreq,MetaLength --> {@link DataOutput#writeVInt + * VInt} + *
  • TotalTermFreq --> {@link DataOutput#writeVLong VLong} + *
  • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
+ * + *

Notes: + * + *

    + *
  • Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information for + * the BlockTree implementation. + *
  • DocFreq is the count of documents which contain the term. + *
  • TotalTermFreq is the total number of occurrences of the term. This is encoded as the + * difference between the total number of occurrences and the DocFreq. + *
  • PostingsHeader and TermMetadata are plugged into by the specific postings implementation: + * these contain arbitrary per-file data (such as parameters or versioning information) and + * per-term data (such as pointers to inverted files). + *
  • For inner nodes of the tree, every entry will steal one bit to mark whether it points to
 + * child nodes (sub-blocks). If so, the corresponding TermStats and TermMetadata are omitted.
 + *
+ * + *

+ * + *

Term Metadata

+ * + *

The .tmd file contains the list of term metadata (such as FST index metadata) and field level + * statistics (such as sum of total term freq). + * + *

    + *
  • TermsMeta (.tmd) --> Header, NumFields, <FieldStats>^NumFields,
 + * TermIndexLength, TermDictLength, Footer
 + *
  • FieldStats --> FieldNumber, NumTerms, RootCodeLength, Byte^RootCodeLength,
 + * SumTotalTermFreq?, SumDocFreq, DocCount, MinTerm, MaxTerm, IndexStartFP, FSTHeader,
 + * FSTMetadata
 + *
  • Header,FSTHeader --> {@link CodecUtil#writeHeader CodecHeader} + *
  • TermIndexLength, TermDictLength --> {@link DataOutput#writeLong Uint64} + *
  • MinTerm,MaxTerm --> {@link DataOutput#writeVInt VInt} length followed by the byte[] + *
  • NumFields,FieldNumber,RootCodeLength,DocCount --> {@link DataOutput#writeVInt VInt} + *
  • NumTerms,SumTotalTermFreq,SumDocFreq,IndexStartFP --> {@link DataOutput#writeVLong + * VLong} + *
  • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
+ * + *

Notes: + * + *

    + *
  • FieldNumber is the field's number from {@link FieldInfos}. (.fnm)
 + *
  • NumTerms is the number of unique terms for the field. + *
  • RootCode points to the root block for the field. + *
  • SumDocFreq is the total number of postings, the number of term-document pairs across the + * entire field. + *
  • DocCount is the number of documents that have at least one posting for this field. + *
  • MinTerm, MaxTerm are the lowest and highest term in this field. + *
+ * + * + * + *

Term Index

+ * + *

The .tip file contains an index into the term dictionary, so that it can be accessed randomly. + * The index is also used to determine when a given term cannot exist on disk (in the .tim file), + * saving a disk seek. + * + *

    + *
  • TermsIndex (.tip) --> Header, FSTIndex^NumFields, Footer
 + *
  • Header --> {@link CodecUtil#writeHeader CodecHeader} + * + *
  • FSTIndex --> {@link FST FST<byte[]>} + *
  • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
+ * + *

Notes: + * + *

    + *
  • The .tip file contains a separate FST for each field. The FST maps a term prefix to the + * on-disk block that holds all terms starting with that prefix. Each field's IndexStartFP + * points to its FST. + *
  • It's possible that an on-disk block would contain too many terms (more than the allowed + * maximum (default: 48)). When this happens, the block is sub-divided into new blocks (called + * "floor blocks"), and then the output in the FST for the block's prefix encodes the leading + * byte of each sub-block, and its file pointer. + *
+ *
+ * @see Lucene90BlockTreeTermsReader
+ */
+public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
+
+    /**
+     * Suggested default value for the {@code minItemsInBlock} parameter to {@link
+     * #Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}.
+     */
+    public static final int DEFAULT_MIN_BLOCK_SIZE = 25;
+
+    /**
+     * Suggested default value for the {@code maxItemsInBlock} parameter to {@link
+     * #Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}.
+     */
+    public static final int DEFAULT_MAX_BLOCK_SIZE = 48;
+
+    public static final int OUTPUT_FLAGS_NUM_BITS = 2;
+    public static final int OUTPUT_FLAGS_MASK = 0x3;
+    public static final int OUTPUT_FLAG_IS_FLOOR = 0x1;
+    public static final int OUTPUT_FLAG_HAS_TERMS = 0x2;
+
+    /** Extension of terms file */
+    static final String TERMS_EXTENSION = "tim";
+    static final String TERMS_CODEC_NAME = "BlockTreeTermsDict";
+
+    /** Extension of terms index file */
+    static final String TERMS_INDEX_EXTENSION = "tip";
+    static final String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex";
+
+    /** Extension of terms meta file */
+    static final String TERMS_META_EXTENSION = "tmd";
+    static final String TERMS_META_CODEC_NAME = "BlockTreeTermsMeta";
+
+    // public static boolean DEBUG = false;
+    // public static boolean DEBUG2 = false;
+
+    // private final static boolean SAVE_DOT_FILES = false;
+
+    private final IndexOutput metaOut;
+    private final IndexOutput termsOut;
+    private final IndexOutput indexOut;
+    final int maxDoc;
+    final int minItemsInBlock;
+    final int maxItemsInBlock;
+    final int version;
+
+    final PostingsWriterBase postingsWriter;
+    final FieldInfos fieldInfos;
+
+    private final List<FieldMetaData> fields = new ArrayList<>();
+
+    /**
+     * Create a new writer. The number of items (terms or sub-blocks) per block will aim to be between
+     * minItemsPerBlock and maxItemsPerBlock, though in some cases the blocks may be smaller than the
+     * min.
+     */
+    public Lucene90BlockTreeTermsWriter(
+        SegmentWriteState state,
+        PostingsWriterBase postingsWriter,
+        int minItemsInBlock,
+        int maxItemsInBlock
+    ) throws IOException {
+        this(state, postingsWriter, minItemsInBlock, maxItemsInBlock, Lucene90BlockTreeTermsReader.VERSION_CURRENT);
+    }
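This fork keeps the ES812 postings format writable: Lucene 10.3 moved the Lucene90 block-tree terms dictionary to backward_codecs with only the reader, so Elasticsearch now carries the writer itself. A hedged sketch of how a postings format wires it up, following the close-on-failure pattern Lucene90PostingsFormat uses (the wrapper class and the supplied PostingsWriterBase are assumptions; the writer's import is omitted since its package is the fork's):

```java
import java.io.IOException;

import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

class BlockTreeWiringExample {
    static FieldsConsumer newFieldsConsumer(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
        boolean success = false;
        try {
            FieldsConsumer ret = new Lucene90BlockTreeTermsWriter(
                state,
                postingsWriter,
                Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
                Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE
            );
            success = true;
            return ret;
        } finally {
            if (success == false) {
                // the terms writer owns the postings writer once constructed
                IOUtils.closeWhileHandlingException(postingsWriter);
            }
        }
    }
}
```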
+
+    /** Expert constructor that allows configuring the version, used for bw tests. */
+    public Lucene90BlockTreeTermsWriter(
+        SegmentWriteState state,
+        PostingsWriterBase postingsWriter,
+        int minItemsInBlock,
+        int maxItemsInBlock,
+        int version
+    ) throws IOException {
+        validateSettings(minItemsInBlock, maxItemsInBlock);
+
+        this.minItemsInBlock = minItemsInBlock;
+        this.maxItemsInBlock = maxItemsInBlock;
+        if (version < Lucene90BlockTreeTermsReader.VERSION_START || version > Lucene90BlockTreeTermsReader.VERSION_CURRENT) {
+            throw new IllegalArgumentException(
+                "Expected version in range ["
+                    + Lucene90BlockTreeTermsReader.VERSION_START
+                    + ", "
+                    + Lucene90BlockTreeTermsReader.VERSION_CURRENT
+                    + "], but got "
+                    + version
+            );
+        }
+        this.version = version;
+
+        this.maxDoc = state.segmentInfo.maxDoc();
+        this.fieldInfos = state.fieldInfos;
+        this.postingsWriter = postingsWriter;
+
+        final String termsName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
+        termsOut = state.directory.createOutput(termsName, state.context);
+        boolean success = false;
+        IndexOutput metaOut = null, indexOut = null;
+        try {
+            CodecUtil.writeIndexHeader(termsOut, TERMS_CODEC_NAME, version, state.segmentInfo.getId(), state.segmentSuffix);
+
+            final String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
+            indexOut = state.directory.createOutput(indexName, state.context);
+            CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, version, state.segmentInfo.getId(), state.segmentSuffix);
+            // segment = state.segmentInfo.name;
+
+            final String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_META_EXTENSION);
+            metaOut = state.directory.createOutput(metaName, state.context);
+            CodecUtil.writeIndexHeader(metaOut, TERMS_META_CODEC_NAME, version, state.segmentInfo.getId(), state.segmentSuffix);
+
+            postingsWriter.init(metaOut, state); // have consumer write its format/header
+
+            this.metaOut = metaOut;
+            this.indexOut = indexOut;
+            success = true;
+        } finally {
+            if (success == false) {
+                IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut);
+            }
+        }
+    }
+
+    /** Throws {@code IllegalArgumentException} if any of these settings is invalid.
*/ + public static void validateSettings(int minItemsInBlock, int maxItemsInBlock) { + if (minItemsInBlock <= 1) { + throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock); + } + if (minItemsInBlock > maxItemsInBlock) { + throw new IllegalArgumentException( + "maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock + ); + } + if (2 * (minItemsInBlock - 1) > maxItemsInBlock) { + throw new IllegalArgumentException( + "maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + + maxItemsInBlock + + " minItemsInBlock=" + + minItemsInBlock + ); + } + } + + @Override + public void write(Fields fields, NormsProducer norms) throws IOException { + // if (DEBUG) System.out.println("\nBTTW.write seg=" + segment); + + String lastField = null; + for (String field : fields) { + assert lastField == null || lastField.compareTo(field) < 0; + lastField = field; + + // if (DEBUG) System.out.println("\nBTTW.write seg=" + segment + " field=" + field); + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + + TermsEnum termsEnum = terms.iterator(); + TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field)); + while (true) { + BytesRef term = termsEnum.next(); + // if (DEBUG) System.out.println("BTTW: next term " + term); + + if (term == null) { + break; + } + + // if (DEBUG) System.out.println("write field=" + fieldInfo.name + " term=" + + // ToStringUtils.bytesRefToString(term)); + termsWriter.write(term, termsEnum, norms); + } + + termsWriter.finish(); + + // if (DEBUG) System.out.println("\nBTTW.write done seg=" + segment + " field=" + field); + } + } + + static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) { + assert fp < (1L << 62); + return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0); + } + + private static class PendingEntry { + public final boolean isTerm; + + protected PendingEntry(boolean isTerm) { + this.isTerm = isTerm; + } + } + + private static final class PendingTerm extends PendingEntry { + public final byte[] termBytes; + // stats + metadata + public final BlockTermState state; + + PendingTerm(BytesRef term, BlockTermState state) { + super(true); + this.termBytes = new byte[term.length]; + System.arraycopy(term.bytes, term.offset, termBytes, 0, term.length); + this.state = state; + } + + @Override + public String toString() { + return "TERM: " + ToStringUtils.bytesRefToString(termBytes); + } + } + + /** + * Encodes long value to variable length byte[], in MSB order. Use {@link + * FieldReader readMSBVLong} to decode. + * + *

Package private for testing
+     */
+    static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException {
+        assert l >= 0;
+        // Keep zero bits on most significant byte to have more chance to get prefix bytes shared.
+        // e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40]
+        final int bytesNeeded = (Long.SIZE - Long.numberOfLeadingZeros(l) - 1) / 7 + 1;
+        l <<= Long.SIZE - bytesNeeded * 7;
+        for (int i = 1; i < bytesNeeded; i++) {
+            scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL) | 0x80));
+            l = l << 7;
+        }
+        scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL)));
+    }
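The comment above carries the worked example: 0x7FFF needs three 7-bit groups, and writing them most-significant-first gives [0x81, 0xFF, 0x7F], so values sharing high bits share encoded prefixes — which is what lets the FST outputs compress well. A matching decoder sketch; the real counterpart is readMSBVLong in the forked FieldReader, so this standalone version is purely illustrative:

```java
import java.io.IOException;

import org.apache.lucene.store.DataInput;

class MsbVLongExample {
    /** Reads a value written by writeMSBVLong: 0x80 is the continuation bit, payload in the low 7 bits. */
    static long readMSBVLong(DataInput in) throws IOException {
        long l = 0L;
        while (true) {
            final byte b = in.readByte();
            l = (l << 7) | (b & 0x7FL);
            if ((b & 0x80) == 0) {
                break;
            }
        }
        return l;
    }
}
```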
+
+    private final class PendingBlock extends PendingEntry {
+        public final BytesRef prefix;
+        public final long fp;
+        public FST<BytesRef> index;
+        public List<FST<BytesRef>> subIndices;
+        public final boolean hasTerms;
+        public final boolean isFloor;
+        public final int floorLeadByte;
+
+        PendingBlock(BytesRef prefix, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List<FST<BytesRef>> subIndices) {
+            super(false);
+            this.prefix = prefix;
+            this.fp = fp;
+            this.hasTerms = hasTerms;
+            this.isFloor = isFloor;
+            this.floorLeadByte = floorLeadByte;
+            this.subIndices = subIndices;
+        }
+
+        @Override
+        public String toString() {
+            return "BLOCK: prefix=" + ToStringUtils.bytesRefToString(prefix);
+        }
+
+        public void compileIndex(List<PendingBlock> blocks, ByteBuffersDataOutput scratchBytes, IntsRefBuilder scratchIntsRef)
+            throws IOException {
+
+            assert (isFloor && blocks.size() > 1) || (isFloor == false && blocks.size() == 1) : "isFloor=" + isFloor + " blocks=" + blocks;
+            assert this == blocks.get(0);
+
+            assert scratchBytes.size() == 0;
+
+            // write the leading vLong in MSB order for better outputs sharing in the FST
+            if (version >= Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT) {
+                writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes);
+            } else {
+                scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
+            }
+            if (isFloor) {
+                scratchBytes.writeVInt(blocks.size() - 1);
+                for (int i = 1; i < blocks.size(); i++) {
+                    PendingBlock sub = blocks.get(i);
+                    assert sub.floorLeadByte != -1;
+                    // if (DEBUG) {
+                    //   System.out.println("    write floorLeadByte=" + Integer.toHexString(sub.floorLeadByte&0xff));
+                    // }
+                    scratchBytes.writeByte((byte) sub.floorLeadByte);
+                    assert sub.fp > fp;
+                    scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0));
+                }
+            }
+
+            long estimateSize = prefix.length;
+            for (PendingBlock block : blocks) {
+                if (block.subIndices != null) {
+                    for (FST<BytesRef> subIndex : block.subIndices) {
+                        estimateSize += subIndex.numBytes();
+                    }
+                }
+            }
+            int estimateBitsRequired = PackedInts.bitsRequired(estimateSize);
+            int pageBits = Math.min(15, Math.max(6, estimateBitsRequired));
+
+            final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+            final int fstVersion;
+            if (version >= Lucene90BlockTreeTermsReader.VERSION_CURRENT) {
+                fstVersion = FST.VERSION_CURRENT;
+            } else {
+                fstVersion = FST.VERSION_90;
+            }
+            final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
+                // Disable suffixes sharing for block tree index because suffixes are mostly dropped
+                // from the FST index and left in the term blocks.
+                .suffixRAMLimitMB(0d)
+                .dataOutput(getOnHeapReaderWriter(pageBits))
+                .setVersion(fstVersion)
+                .build();
+            // if (DEBUG) {
+            //   System.out.println("  compile index for prefix=" + prefix);
+            // }
+            // indexBuilder.DEBUG = false;
+            final byte[] bytes = scratchBytes.toArrayCopy();
+            assert bytes.length > 0;
+            fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
+            scratchBytes.reset();
+
+            // Copy over index for all sub-blocks
+            for (PendingBlock block : blocks) {
+                if (block.subIndices != null) {
+                    for (FST<BytesRef> subIndex : block.subIndices) {
+                        append(fstCompiler, subIndex, scratchIntsRef);
+                    }
+                    block.subIndices = null;
+                }
+            }
+
+            index = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
+
+            assert subIndices == null;
+
+            /*
+            Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
+            Util.toDot(index, w, false, false);
+            System.out.println("SAVED to out.dot");
+            w.close();
+            */
+        }
+
+        // TODO: maybe we could add bulk-add method to
+        // Builder?  Takes FST and unions it w/ current
+        // FST.
+        private void append(FSTCompiler<BytesRef> fstCompiler, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
+            final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
+            BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
+            while ((indexEnt = subIndexEnum.next()) != null) {
+                // if (DEBUG) {
+                //   System.out.println("      add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
+                // }
+                fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
+            }
+        }
+    }
+
+    private final ByteBuffersDataOutput scratchBytes = ByteBuffersDataOutput.newResettableInstance();
+    private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
+
+    private static class StatsWriter {
+
+        private final DataOutput out;
+        private final boolean hasFreqs;
+        private int singletonCount;
+
+        StatsWriter(DataOutput out, boolean hasFreqs) {
+            this.out = out;
+            this.hasFreqs = hasFreqs;
+        }
+
+        void add(int df, long ttf) throws IOException {
+            // Singletons (DF==1, TTF==1) are run-length encoded
+            if (df == 1 && (hasFreqs == false || ttf == 1)) {
+                singletonCount++;
+            } else {
+                finish();
+                out.writeVInt(df << 1);
+                if (hasFreqs) {
+                    out.writeVLong(ttf - df);
+                }
+            }
+        }
+
+        void finish() throws IOException {
+            if (singletonCount > 0) {
+                out.writeVInt(((singletonCount - 1) << 1) | 1);
+                singletonCount = 0;
+            }
+        }
+    }
+
+    class TermsWriter {
+        private final FieldInfo fieldInfo;
+        private long numTerms;
+        final FixedBitSet docsSeen;
+        long sumTotalTermFreq;
+        long sumDocFreq;
+
+        // Records index into pending where the current prefix at that
+        // length "started"; for example, if current term starts with 't',
+        // startsByPrefix[0] is the index into pending for the first
+        // term/sub-block starting with 't'.  We use this to figure out when
+        // to write a new block:
+        private final BytesRefBuilder lastTerm = new BytesRefBuilder();
+        private int[] prefixStarts = new int[8];
+
+        // Pending stack of terms and blocks.
As terms arrive (in sorted order) + // we append to this stack, and once the top of the stack has enough + // terms starting with a common prefix, we write a new block with + // those terms and replace those terms in the stack with a new block: + private final List pending = new ArrayList<>(); + + // Reused in writeBlocks: + private final List newBlocks = new ArrayList<>(); + + private PendingTerm firstPendingTerm; + private PendingTerm lastPendingTerm; + + /** Writes the top count entries in pending, using prevTerm to compute the prefix. */ + void writeBlocks(int prefixLength, int count) throws IOException { + + assert count > 0; + + // if (DEBUG2) { + // BytesRef br = new BytesRef(lastTerm.bytes()); + // br.length = prefixLength; + // System.out.println("writeBlocks: seg=" + segment + " prefix=" + + // ToStringUtils.bytesRefToString(br) + " count=" + count); + // } + + // Root block better write all remaining pending entries: + assert prefixLength > 0 || count == pending.size(); + + int lastSuffixLeadLabel = -1; + + // True if we saw at least one term in this block (we record if a block + // only points to sub-blocks in the terms index so we can avoid seeking + // to it when we are looking for a term): + boolean hasTerms = false; + boolean hasSubBlocks = false; + + int start = pending.size() - count; + int end = pending.size(); + int nextBlockStart = start; + int nextFloorLeadLabel = -1; + + for (int i = start; i < end; i++) { + + PendingEntry ent = pending.get(i); + + int suffixLeadLabel; + + if (ent.isTerm) { + PendingTerm term = (PendingTerm) ent; + if (term.termBytes.length == prefixLength) { + // Suffix is 0, i.e. prefix 'foo' and term is + // 'foo' so the term has empty string suffix + // in this block + assert lastSuffixLeadLabel == -1 : "i=" + i + " lastSuffixLeadLabel=" + lastSuffixLeadLabel; + suffixLeadLabel = -1; + } else { + suffixLeadLabel = term.termBytes[prefixLength] & 0xff; + } + } else { + PendingBlock block = (PendingBlock) ent; + assert block.prefix.length > prefixLength; + suffixLeadLabel = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff; + } + // if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + " suffixLeadLabel=" + + // suffixLeadLabel); + + if (suffixLeadLabel != lastSuffixLeadLabel) { + int itemsInBlock = i - nextBlockStart; + if (itemsInBlock >= minItemsInBlock && end - nextBlockStart > maxItemsInBlock) { + // The count is too large for one block, so we must break it into "floor" blocks, where + // we record + // the leading label of the suffix of the first term in each floor block, so at search + // time we can + // jump to the right floor block. We just use a naive greedy segmenter here: make a new + // floor + // block as soon as we have at least minItemsInBlock. 
This is not always best: it often + // produces + // a too-small block as the final block: + boolean isFloor = itemsInBlock < count; + newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, i, hasTerms, hasSubBlocks)); + + hasTerms = false; + hasSubBlocks = false; + nextFloorLeadLabel = suffixLeadLabel; + nextBlockStart = i; + } + + lastSuffixLeadLabel = suffixLeadLabel; + } + + if (ent.isTerm) { + hasTerms = true; + } else { + hasSubBlocks = true; + } + } + + // Write last block, if any: + if (nextBlockStart < end) { + int itemsInBlock = end - nextBlockStart; + boolean isFloor = itemsInBlock < count; + newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, end, hasTerms, hasSubBlocks)); + } + + assert newBlocks.isEmpty() == false; + + PendingBlock firstBlock = newBlocks.get(0); + + assert firstBlock.isFloor || newBlocks.size() == 1; + + firstBlock.compileIndex(newBlocks, scratchBytes, scratchIntsRef); + + // Remove slice from the top of the pending stack, that we just wrote: + pending.subList(pending.size() - count, pending.size()).clear(); + + // Append new block + pending.add(firstBlock); + + newBlocks.clear(); + } + + private boolean allEqual(byte[] b, int startOffset, int endOffset, byte value) { + Objects.checkFromToIndex(startOffset, endOffset, b.length); + for (int i = startOffset; i < endOffset; ++i) { + if (b[i] != value) { + return false; + } + } + return true; + } + + /** + * Writes the specified slice (start is inclusive, end is exclusive) from pending stack as a new + * block. If isFloor is true, there were too many (more than maxItemsInBlock) entries sharing + * the same prefix, and so we broke it into multiple floor blocks where we record the starting + * label of the suffix of each floor block. + */ + private PendingBlock writeBlock( + int prefixLength, + boolean isFloor, + int floorLeadLabel, + int start, + int end, + boolean hasTerms, + boolean hasSubBlocks + ) throws IOException { + + assert end > start; + + long startFP = termsOut.getFilePointer(); + + boolean hasFloorLeadLabel = isFloor && floorLeadLabel != -1; + + final BytesRef prefix = new BytesRef(prefixLength + (hasFloorLeadLabel ? 1 : 0)); + System.arraycopy(lastTerm.get().bytes, 0, prefix.bytes, 0, prefixLength); + prefix.length = prefixLength; + + // if (DEBUG2) System.out.println(" writeBlock field=" + fieldInfo.name + " prefix=" + + // ToStringUtils.bytesRefToString(prefix) + " fp=" + startFP + " isFloor=" + isFloor + + // " isLastInFloor=" + (end == pending.size()) + " floorLeadLabel=" + floorLeadLabel + + // " start=" + start + " end=" + end + " hasTerms=" + hasTerms + " hasSubBlocks=" + + // hasSubBlocks); + + // Write block header: + int numEntries = end - start; + int code = numEntries << 1; + if (end == pending.size()) { + // Last block: + code |= 1; + } + termsOut.writeVInt(code); + + /* + if (DEBUG) { + System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + ToStringUtils.bytesRefToString(prefix) + + " entCount="+(end-start+1) +" startFP="+startFP+(isFloor ? (" floorLeadLabel=" + Integer.toHexString(floorLeadLabel)) : "")); + } + */ + + // 1st pass: pack term suffix bytes into byte[] blob + // TODO: cutover to bulk int codec... simple64? 
+ + // We optimize the leaf block case (block has only terms), writing a more + // compact format in this case: + boolean isLeafBlock = hasSubBlocks == false; + + // System.out.println(" isLeaf=" + isLeafBlock); + + final List<FST<BytesRef>> subIndices; + + boolean absolute = true; + + if (isLeafBlock) { + // Block contains only ordinary terms: + subIndices = null; + StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS); + for (int i = start; i < end; i++) { + PendingEntry ent = pending.get(i); + assert ent.isTerm : "i=" + i; + + PendingTerm term = (PendingTerm) ent; + + assert StringHelper.startsWith(term.termBytes, prefix) : term + " prefix=" + prefix; + BlockTermState state = term.state; + final int suffix = term.termBytes.length - prefixLength; + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write term suffix=" + + // ToStringUtils.bytesRefToString(suffixBytes)); + // } + + // For leaf block we write suffix straight + suffixLengthsWriter.writeVInt(suffix); + suffixWriter.append(term.termBytes, prefixLength, suffix); + assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel; + + // Write term stats, to separate byte[] blob: + statsWriter.add(state.docFreq, state.totalTermFreq); + + // Write term meta data + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + absolute = false; + } + statsWriter.finish(); + } else { + // Block has at least one prefix term or a sub block: + subIndices = new ArrayList<>(); + StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS); + for (int i = start; i < end; i++) { + PendingEntry ent = pending.get(i); + if (ent.isTerm) { + PendingTerm term = (PendingTerm) ent; + + assert StringHelper.startsWith(term.termBytes, prefix) : term + " prefix=" + prefix; + BlockTermState state = term.state; + final int suffix = term.termBytes.length - prefixLength; + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write term suffix=" + + // ToStringUtils.bytesRefToString(suffixBytes)); + // } + + // For non-leaf block we borrow 1 bit to record + // if entry is term or sub-block, and 1 bit to record if + // it's a prefix term. Terms cannot be larger than ~32 KB + // so we won't run out of bits: + + suffixLengthsWriter.writeVInt(suffix << 1); + suffixWriter.append(term.termBytes, prefixLength, suffix); + + // Write term stats, to separate byte[] blob: + statsWriter.add(state.docFreq, state.totalTermFreq); + + // TODO: now that terms dict "sees" these longs, + // we can explore better column-stride encodings + // to encode all long[0]s for this block at + // once, all long[1]s, etc., e.g. using + // Simple64. Alternatively, we could interleave + // stats + meta ... 
no reason to have them + // separate anymore: + + // Write term meta data + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + absolute = false; + } else { + PendingBlock block = (PendingBlock) ent; + assert StringHelper.startsWith(block.prefix, prefix); + final int suffix = block.prefix.length - prefixLength; + + assert suffix > 0; + + // For non-leaf block we borrow 1 bit to record + // if entry is term or sub-block: + suffixLengthsWriter.writeVInt((suffix << 1) | 1); + suffixWriter.append(block.prefix.bytes, prefixLength, suffix); + + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write sub-block suffix=" + + // ToStringUtils.bytesRefToString(suffixBytes) + " subFP=" + block.fp + " subCode=" + + // (startFP-block.fp) + " floor=" + block.isFloor); + // } + + assert floorLeadLabel == -1 || (block.prefix.bytes[prefixLength] & 0xff) >= floorLeadLabel + : "floorLeadLabel=" + floorLeadLabel + " suffixLead=" + (block.prefix.bytes[prefixLength] & 0xff); + assert block.fp < startFP; + + suffixLengthsWriter.writeVLong(startFP - block.fp); + subIndices.add(block.index); + } + } + statsWriter.finish(); + + assert subIndices.size() != 0; + } + + // Write suffixes byte[] blob to terms dict output, either uncompressed, compressed with LZ4 + // or with LowercaseAsciiCompression. + CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION; + // If there are 2 suffix bytes or less per term, then we don't bother compressing as suffixes + // are unlikely to be what + // makes the terms dictionary large, and it also tends to frequently be the case for dense IDs + // like + // auto-increment IDs, so not compressing in that case helps avoid hurting ID lookups too much. + // We also only start compressing when the prefix length is greater than 2 since blocks whose + // prefix length is + // 1 or 2 always all get visited when running a fuzzy query whose max number of edits is 2. + if (suffixWriter.length() > 2L * numEntries && prefixLength > 2) { + // LZ4 inserts references whenever it sees duplicate strings of 4 chars or more, so only try + // it out if the + // average suffix length is greater than 6. 
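+ // For example, a block of 100 suffixes totaling 800 bytes (8 bytes per term, prefix length > 2) + // attempts LZ4 first, and the compressed output is kept only if it is smaller than 600 bytes, + // i.e. only if it saves more than 25%.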
+ if (suffixWriter.length() > 6L * numEntries) { + if (compressionHashTable == null) { + compressionHashTable = new LZ4.HighCompressionHashTable(); + } + LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable); + if (spareWriter.size() < suffixWriter.length() - (suffixWriter.length() >>> 2)) { + // LZ4 saved more than 25%, go for it + compressionAlg = CompressionAlgorithm.LZ4; + } + } + if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) { + spareWriter.reset(); + if (spareBytes.length < suffixWriter.length()) { + spareBytes = new byte[ArrayUtil.oversize(suffixWriter.length(), 1)]; + } + if (LowercaseAsciiCompression.compress(suffixWriter.bytes(), suffixWriter.length(), spareBytes, spareWriter)) { + compressionAlg = CompressionAlgorithm.LOWERCASE_ASCII; + } + } + } + long token = ((long) suffixWriter.length()) << 3; + if (isLeafBlock) { + token |= 0x04; + } + token |= compressionAlg.code; + termsOut.writeVLong(token); + if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) { + termsOut.writeBytes(suffixWriter.bytes(), suffixWriter.length()); + } else { + spareWriter.copyTo(termsOut); + } + suffixWriter.setLength(0); + spareWriter.reset(); + + // Write suffix lengths + final int numSuffixBytes = Math.toIntExact(suffixLengthsWriter.size()); + spareBytes = ArrayUtil.growNoCopy(spareBytes, numSuffixBytes); + suffixLengthsWriter.copyTo(new ByteArrayDataOutput(spareBytes)); + suffixLengthsWriter.reset(); + if (allEqual(spareBytes, 1, numSuffixBytes, spareBytes[0])) { + // Structured fields like IDs often have most values of the same length + termsOut.writeVInt((numSuffixBytes << 1) | 1); + termsOut.writeByte(spareBytes[0]); + } else { + termsOut.writeVInt(numSuffixBytes << 1); + termsOut.writeBytes(spareBytes, numSuffixBytes); + } + + // Stats + final int numStatsBytes = Math.toIntExact(statsWriter.size()); + termsOut.writeVInt(numStatsBytes); + statsWriter.copyTo(termsOut); + statsWriter.reset(); + + // Write term meta data byte[] blob + termsOut.writeVInt((int) metaWriter.size()); + metaWriter.copyTo(termsOut); + metaWriter.reset(); + + // if (DEBUG) { + // System.out.println(" fpEnd=" + out.getFilePointer()); + // } + + if (hasFloorLeadLabel) { + // We already allocated to length+1 above: + prefix.bytes[prefix.length++] = (byte) floorLeadLabel; + } + + return new PendingBlock(prefix, startFP, hasTerms, isFloor, floorLeadLabel, subIndices); + } + + TermsWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + assert fieldInfo.getIndexOptions() != IndexOptions.NONE; + docsSeen = new FixedBitSet(maxDoc); + postingsWriter.setField(fieldInfo); + } + + /** Writes one term's worth of postings. 
*/ + public void write(BytesRef text, TermsEnum termsEnum, NormsProducer norms) throws IOException { + /* + if (DEBUG) { + int[] tmp = new int[lastTerm.length]; + System.arraycopy(prefixStarts, 0, tmp, 0, tmp.length); + System.out.println("BTTW: write term=" + ToStringUtils.bytesRefToString(text) + " prefixStarts=" + Arrays.toString(tmp) + + " pending.size()=" + pending.size()); + } + */ + + BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen, norms); + if (state != null) { + + assert state.docFreq != 0; + assert fieldInfo.getIndexOptions() == IndexOptions.DOCS || state.totalTermFreq >= state.docFreq + : "postingsWriter=" + postingsWriter; + pushTerm(text); + + PendingTerm term = new PendingTerm(text, state); + pending.add(term); + // if (DEBUG) System.out.println(" add pending term = " + text + " pending.size()=" + + // pending.size()); + + sumDocFreq += state.docFreq; + sumTotalTermFreq += state.totalTermFreq; + numTerms++; + if (firstPendingTerm == null) { + firstPendingTerm = term; + } + lastPendingTerm = term; + } + } + + /** Pushes the new term to the top of the stack, and writes new blocks. */ + private void pushTerm(BytesRef text) throws IOException { + // Find common prefix between last term and current term: + int prefixLength = Arrays.mismatch(lastTerm.bytes(), 0, lastTerm.length(), text.bytes, text.offset, text.offset + text.length); + if (prefixLength == -1) { // Only happens for the first term, if it is empty + assert lastTerm.length() == 0; + prefixLength = 0; + } + + // if (DEBUG) System.out.println(" shared=" + pos + " lastTerm.length=" + lastTerm.length); + + // Close the "abandoned" suffix now: + for (int i = lastTerm.length() - 1; i >= prefixLength; i--) { + + // How many items on top of the stack share the current suffix + // we are closing: + int prefixTopSize = pending.size() - prefixStarts[i]; + if (prefixTopSize >= minItemsInBlock) { + // if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + + // " minItemsInBlock=" + minItemsInBlock); + writeBlocks(i + 1, prefixTopSize); + prefixStarts[i] -= prefixTopSize - 1; + } + } + + if (prefixStarts.length < text.length) { + prefixStarts = ArrayUtil.grow(prefixStarts, text.length); + } + + // Init new tail: + for (int i = prefixLength; i < text.length; i++) { + prefixStarts[i] = pending.size(); + } + + lastTerm.copyBytes(text); + } + + // Finishes all terms in this field + public void finish() throws IOException { + if (numTerms > 0) { + // if (DEBUG) System.out.println("BTTW: finish prefixStarts=" + + // Arrays.toString(prefixStarts)); + + // Add empty term to force closing of all final blocks: + pushTerm(new BytesRef()); + + // TODO: if pending.size() is already 1 with a non-zero prefix length + // we can save writing a "degenerate" root block, but we have to + // fix all the places that assume the root block's prefix is the empty string: + pushTerm(new BytesRef()); + writeBlocks(0, pending.size()); + + // We better have one final "root" block: + assert pending.size() == 1 && pending.get(0).isTerm == false : "pending.size()=" + pending.size() + " pending=" + pending; + final PendingBlock root = (PendingBlock) pending.get(0); + assert root.prefix.length == 0; + final BytesRef rootCode = root.index.getEmptyOutput(); + assert rootCode != null; + + ByteBuffersDataOutput metaOut = new ByteBuffersDataOutput(); + fields.add(metaOut); + + metaOut.writeVInt(fieldInfo.number); + metaOut.writeVLong(numTerms); + metaOut.writeVInt(rootCode.length); + 
metaOut.writeBytes(rootCode.bytes, rootCode.offset, rootCode.length); + assert fieldInfo.getIndexOptions() != IndexOptions.NONE; + if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) { + metaOut.writeVLong(sumTotalTermFreq); + } + metaOut.writeVLong(sumDocFreq); + metaOut.writeVInt(docsSeen.cardinality()); + writeBytesRef(metaOut, new BytesRef(firstPendingTerm.termBytes)); + writeBytesRef(metaOut, new BytesRef(lastPendingTerm.termBytes)); + metaOut.writeVLong(indexOut.getFilePointer()); + // Write FST to index + root.index.save(metaOut, indexOut); + // System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name); + + /* + if (DEBUG) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(root.index, w, false, false); + System.out.println("SAVED to " + dotFileName); + w.close(); + } + */ + + } else { + assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS && sumTotalTermFreq == -1; + assert sumDocFreq == 0; + assert docsSeen.cardinality() == 0; + } + } + + private final ByteBuffersDataOutput suffixLengthsWriter = ByteBuffersDataOutput.newResettableInstance(); + private final BytesRefBuilder suffixWriter = new BytesRefBuilder(); + private final ByteBuffersDataOutput statsWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput spareWriter = ByteBuffersDataOutput.newResettableInstance(); + private byte[] spareBytes = BytesRef.EMPTY_BYTES; + private LZ4.HighCompressionHashTable compressionHashTable; + } + + private boolean closed; + + @Override + public void close() throws IOException { + if (closed) { + return; + } + closed = true; + + boolean success = false; + try { + metaOut.writeVInt(fields.size()); + for (ByteBuffersDataOutput fieldMeta : fields) { + fieldMeta.copyTo(metaOut); + } + CodecUtil.writeFooter(indexOut); + metaOut.writeLong(indexOut.getFilePointer()); + CodecUtil.writeFooter(termsOut); + metaOut.writeLong(termsOut.getFilePointer()); + CodecUtil.writeFooter(metaOut); + success = true; + } finally { + if (success) { + IOUtils.close(metaOut, termsOut, indexOut, postingsWriter); + } else { + IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut, postingsWriter); + } + } + } + + private static void writeBytesRef(DataOutput out, BytesRef bytes) throws IOException { + out.writeVInt(bytes.length); + out.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java index f2bb92cffadd1..5d90f2814853d 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java @@ -304,6 +304,11 @@ public boolean advanceExact(int target) throws IOException { doc = target; return true; } + + @Override + public int docIDRunEnd() throws IOException { + return maxDoc; + } } private abstract static class SparseBinaryDocValues extends BinaryDocValues { @@ -338,6 +343,11 @@ public int advance(int target) throws IOException { public boolean advanceExact(int target) throws IOException { return disi.advanceExact(target); } + + @Override + public int docIDRunEnd() throws IOException { + 
return disi.docIDRunEnd(); + } } @Override @@ -384,6 +394,11 @@ public long cost() { return ords.cost(); } + @Override + public int docIDRunEnd() throws IOException { + return ords.docIDRunEnd(); + } + @Override public BlockLoader.Block tryRead( BlockLoader.BlockFactory factory, @@ -926,6 +941,11 @@ public int advance(int target) throws IOException { public long cost() { return ords.cost(); } + + @Override + public int docIDRunEnd() throws IOException { + return ords.docIDRunEnd(); + } }; } @@ -1297,6 +1317,11 @@ public long longValue() { return 0L; } + @Override + public int docIDRunEnd() { + return maxDoc; + } + @Override long lookAheadValueAt(int targetDoc) throws IOException { return 0L; // Only one ordinal! @@ -1316,6 +1341,11 @@ long lookAheadValueAt(int targetDoc) throws IOException { public long longValue() throws IOException { return 0L; // Only one ordinal! } + + @Override + public int docIDRunEnd() throws IOException { + return disi.docIDRunEnd(); + } }; } } else if (entry.sortedOrdinals != null) { @@ -1341,6 +1371,11 @@ public long longValue() throws IOException { private long[] lookaheadBlock; private IndexInput lookaheadData = null; + @Override + public int docIDRunEnd() { + return maxDoc; + } + @Override public long longValue() throws IOException { final int index = doc; @@ -1469,6 +1504,11 @@ static boolean isDense(int firstDocId, int lastDocId, int length) { private long currentBlockIndex = -1; private final long[] currentBlock = new long[ES819TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE]; + @Override + public int docIDRunEnd() throws IOException { + return disi.docIDRunEnd(); + } + @Override public long longValue() throws IOException { final int index = disi.index(); @@ -1578,6 +1618,11 @@ long lookAheadValueAt(int targetDoc) { public long longValue() { return ordinalsReader.readValueAndAdvance(doc); } + + @Override + public int docIDRunEnd() throws IOException { + return maxDoc; + } }; } else { final var disi = new IndexedDISI( @@ -1593,6 +1638,11 @@ public long longValue() { public long longValue() { return ordinalsReader.readValueAndAdvance(disi.docID()); } + + @Override + public int docIDRunEnd() throws IOException { + return disi.docIDRunEnd(); + } }; } } @@ -1697,6 +1747,11 @@ public long nextValue() throws IOException { public int docValueCount() { return count; } + + @Override + public int docIDRunEnd() { + return maxDoc; + } }; } else { // sparse @@ -1754,6 +1809,11 @@ public int docValueCount() { return count; } + @Override + public int docIDRunEnd() throws IOException { + return disi.docIDRunEnd(); + } + private void set() { if (set == false) { final int index = disi.index(); diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormat.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormat.java index 29f62b64764a9..ef043422a37be 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormat.java @@ -25,12 +25,11 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.util.Bits; import org.apache.lucene.util.hnsw.OrdinalTranslatedKnnCollector; import org.apache.lucene.util.hnsw.RandomVectorScorer; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; -import 
org.elasticsearch.index.codec.vectors.reflect.OffHeapStats; import java.io.IOException; import java.util.Map; @@ -105,7 +104,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE } } - static class ES813FlatVectorReader extends KnnVectorsReader implements OffHeapStats { + static class ES813FlatVectorReader extends KnnVectorsReader { private final FlatVectorsReader reader; @@ -130,13 +129,14 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { } @Override - public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public void search(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { collectAllMatchingDocs(knnCollector, acceptDocs, reader.getRandomVectorScorer(field, target)); } - private void collectAllMatchingDocs(KnnCollector knnCollector, Bits acceptDocs, RandomVectorScorer scorer) throws IOException { + private void collectAllMatchingDocs(KnnCollector knnCollector, AcceptDocs acceptDocs, RandomVectorScorer scorer) + throws IOException { OrdinalTranslatedKnnCollector collector = new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc); - Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs); + Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs.bits()); for (int i = 0; i < scorer.maxOrd(); i++) { if (acceptedOrds == null || acceptedOrds.get(i)) { collector.collect(i, scorer.score(i)); @@ -147,18 +147,18 @@ private void collectAllMatchingDocs(KnnCollector knnCollector, Bits acceptDocs, } @Override - public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public void search(String field, byte[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { collectAllMatchingDocs(knnCollector, acceptDocs, reader.getRandomVectorScorer(field, target)); } @Override - public void close() throws IOException { - reader.close(); + public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) { + return reader.getOffHeapByteSize(fieldInfo); } @Override - public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) { - return OffHeapByteSizeUtils.getOffHeapByteSize(reader, fieldInfo); + public void close() throws IOException { + reader.close(); } } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormat.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormat.java index 8dd4f686a6dea..dd9c1dced748d 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormat.java @@ -23,12 +23,11 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.util.Bits; import org.apache.lucene.util.hnsw.OrdinalTranslatedKnnCollector; import org.apache.lucene.util.hnsw.RandomVectorScorer; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapStats; import java.io.IOException; import java.util.Map; @@ -113,7 +112,7 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE } } - public static class ES813FlatVectorReader extends KnnVectorsReader implements OffHeapStats { + public static 
class ES813FlatVectorReader extends KnnVectorsReader { private final FlatVectorsReader reader; @@ -138,13 +137,14 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { } @Override - public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public void search(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { collectAllMatchingDocs(knnCollector, acceptDocs, reader.getRandomVectorScorer(field, target)); } - private void collectAllMatchingDocs(KnnCollector knnCollector, Bits acceptDocs, RandomVectorScorer scorer) throws IOException { + private void collectAllMatchingDocs(KnnCollector knnCollector, AcceptDocs acceptDocs, RandomVectorScorer scorer) + throws IOException { OrdinalTranslatedKnnCollector collector = new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc); - Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs); + Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs.bits()); for (int i = 0; i < scorer.maxOrd(); i++) { if (acceptedOrds == null || acceptedOrds.get(i)) { collector.collect(i, scorer.score(i)); @@ -155,18 +155,18 @@ private void collectAllMatchingDocs(KnnCollector knnCollector, Bits acceptDocs, } @Override - public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public void search(String field, byte[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { collectAllMatchingDocs(knnCollector, acceptDocs, reader.getRandomVectorScorer(field, target)); } @Override - public void close() throws IOException { - reader.close(); + public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) { + return reader.getOffHeapByteSize(fieldInfo); } @Override - public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) { - return OffHeapByteSizeUtils.getOffHeapByteSize(reader, fieldInfo); + public void close() throws IOException { + reader.close(); } } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java index 3dde6fab00d4c..56710d49b5a7a 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/ES814ScalarQuantizedVectorsFormat.java @@ -34,8 +34,6 @@ import org.apache.lucene.util.quantization.QuantizedByteVectorValues; import org.apache.lucene.util.quantization.QuantizedVectorsReader; import org.apache.lucene.util.quantization.ScalarQuantizer; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapStats; import org.elasticsearch.simdvec.VectorScorerFactory; import org.elasticsearch.simdvec.VectorSimilarityType; @@ -177,7 +175,7 @@ public long ramBytesUsed() { } } - static final class ES814ScalarQuantizedVectorsReader extends FlatVectorsReader implements QuantizedVectorsReader, OffHeapStats { + static final class ES814ScalarQuantizedVectorsReader extends FlatVectorsReader implements QuantizedVectorsReader { final Lucene99ScalarQuantizedVectorsReader delegate; @@ -233,7 +231,7 @@ public long ramBytesUsed() { @Override public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) { - return OffHeapByteSizeUtils.getOffHeapByteSize(delegate, fieldInfo); + return delegate.getOffHeapByteSize(fieldInfo); } } diff --git 
a/server/src/main/java/org/elasticsearch/index/codec/vectors/MergeReaderWrapper.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/MergeReaderWrapper.java index b8376f02e7f47..3c63ae744f451 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/MergeReaderWrapper.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/MergeReaderWrapper.java @@ -13,19 +13,17 @@ import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.elasticsearch.core.IOUtils; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapStats; import java.io.IOException; import java.util.Collection; import java.util.Map; -public class MergeReaderWrapper extends FlatVectorsReader implements OffHeapStats { +public class MergeReaderWrapper extends FlatVectorsReader { private final FlatVectorsReader mainReader; private final FlatVectorsReader mergeReader; @@ -36,11 +34,6 @@ public MergeReaderWrapper(FlatVectorsReader mainReader, FlatVectorsReader mergeR this.mergeReader = mergeReader; } - // For testing - public FlatVectorsReader getMainReader() { - return mainReader; - } - @Override public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException { return mainReader.getRandomVectorScorer(field, target); @@ -66,6 +59,16 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { return mainReader.getByteVectorValues(field); } + @Override + public void search(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { + mainReader.search(field, target, knnCollector, acceptDocs); + } + + @Override + public void search(String field, byte[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { + mainReader.search(field, target, knnCollector, acceptDocs); + } + @Override public FlatVectorsReader getMergeInstance() { return mergeReader; @@ -82,22 +85,14 @@ public Collection<Accountable> getChildResources() { } @Override - public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - mainReader.search(field, target, knnCollector, acceptDocs); - } - - @Override - public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - mainReader.search(field, target, knnCollector, acceptDocs); + public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) { + // TODO: https://github.com/elastic/elasticsearch/issues/128672 + // return mainReader.getOffHeapByteSize(fieldInfo); + return Map.of(); // no off-heap when using direct IO } @Override public void close() throws IOException { IOUtils.close(mainReader, mergeReader); } - - @Override - public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) { - return OffHeapByteSizeUtils.getOffHeapByteSize(mainReader, fieldInfo); - } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsReader.java index 45a8c281028ae..937aedd9236e5 100644 --- 
a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsReader.java @@ -19,7 +19,6 @@ import org.apache.lucene.util.VectorUtil; import org.elasticsearch.index.codec.vectors.OptimizedScalarQuantizer; import org.elasticsearch.index.codec.vectors.cluster.NeighborQueue; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapStats; import org.elasticsearch.simdvec.ES91OSQVectorsScorer; import org.elasticsearch.simdvec.ES92Int7VectorsScorer; import org.elasticsearch.simdvec.ESVectorUtil; @@ -38,7 +37,7 @@ * Default implementation of {@link IVFVectorsReader}. It scores the posting lists centroids using * brute force and then scores the top ones using the posting list. */ -public class ES920DiskBBQVectorsReader extends IVFVectorsReader implements OffHeapStats { +public class ES920DiskBBQVectorsReader extends IVFVectorsReader { public ES920DiskBBQVectorsReader(SegmentReadState state, Map<String, FlatVectorsReader> rawVectorsReader) throws IOException { super(state, rawVectorsReader); diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsReader.java index 8d7ac579451ad..91e28c8bf473b 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsReader.java @@ -22,12 +22,12 @@ import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.internal.hppc.IntObjectHashMap; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; import org.elasticsearch.core.IOUtils; import org.elasticsearch.search.vectors.IVFKnnSearchStrategy; @@ -255,7 +255,7 @@ public final ByteVectorValues getByteVectorValues(String field) throws IOExcepti } @Override - public final void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public final void search(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); if (fieldInfo.getVectorEncoding().equals(VectorEncoding.FLOAT32) == false) { getReaderForField(field).search(field, target, knnCollector, acceptDocs); @@ -266,11 +266,8 @@ public final void search(String field, float[] target, KnnCollector knnCollector "vector query dimension: " + target.length + " differs from field dimension: " + fieldInfo.getVectorDimension() ); } - float percentFiltered = 1f; - if (acceptDocs instanceof BitSet bitSet) { - percentFiltered = Math.max(0f, Math.min(1f, (float) bitSet.approximateCardinality() / bitSet.length())); - } int numVectors = getReaderForField(field).getFloatVectorValues(field).size(); + float percentFiltered = Math.max(0f, Math.min(1f, (float) acceptDocs.cost() / numVectors)); float visitRatio = DYNAMIC_VISIT_RATIO; // Search strategy may be null if this is being called from checkIndex (e.g. 
from a test) if (knnCollector.getSearchStrategy() instanceof IVFKnnSearchStrategy ivfSearchStrategy) { @@ -299,7 +296,8 @@ public final void search(String field, float[] target, KnnCollector knnCollector postListSlice, visitRatio ); - PostingVisitor scorer = getPostingVisitor(fieldInfo, postListSlice, target, acceptDocs); + Bits acceptDocsBits = acceptDocs.bits(); + PostingVisitor scorer = getPostingVisitor(fieldInfo, postListSlice, target, acceptDocsBits); long expectedDocs = 0; long actualDocs = 0; // initially we visit only the "centroids to search" @@ -318,7 +316,7 @@ public final void search(String field, float[] target, KnnCollector knnCollector knnCollector.getSearchStrategy().nextVectorsBlock(); } } - if (acceptDocs != null) { + if (acceptDocsBits != null) { float unfilteredRatioVisited = (float) expectedDocs / numVectors; int filteredVectors = (int) Math.ceil(numVectors * percentFiltered); float expectedScored = Math.min(2 * filteredVectors * unfilteredRatioVisited, expectedDocs / 2f); @@ -334,7 +332,7 @@ public final void search(String field, float[] target, KnnCollector knnCollector } @Override - public final void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public final void search(String field, byte[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); final ByteVectorValues values = getReaderForField(field).getByteVectorValues(field); for (int i = 0; i < values.size(); i++) { @@ -346,6 +344,22 @@ public final void search(String field, byte[] target, KnnCollector knnCollector, } } + @Override + public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) { + var raw = getReaderForField(fieldInfo.name).getOffHeapByteSize(fieldInfo); + FieldEntry fe = fields.get(fieldInfo.number); + if (fe == null) { + assert fieldInfo.getVectorEncoding() == VectorEncoding.BYTE; + return raw; + } + return raw; // for now just return the size of raw + + // TODO: determine desired off-heap requirements + // var centroids = Map.of(EXTENSION, fe.xxxLength()); + // var clusters = Map.of(EXTENSION, fe.yyyLength()); + // return KnnVectorsReader.mergeOffHeapByteSizeMaps(raw, centroids, clusters); + } + @Override public void close() throws IOException { List<Closeable> closeables = new ArrayList<>(rawVectorReaders.values()); diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java index 0f5988b2cd48c..e1c02400019d9 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/diskbbq/IVFVectorsWriter.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.DataAccessHint; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -313,8 +314,13 @@ private void mergeOneFieldIVF(FieldInfo fieldInfo, MergeState mergeState) throws // Even when the file might be sampled, the reads will always be in increasing order, therefore we set the ReadAdvice to SEQUENTIAL // so the OS can optimize read-ahead in low-memory situations. 
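// Accordingly, both the temporary raw-vector file and the optional doc-id file below are opened with IOContext.DEFAULT.withHints(DataAccessHint.SEQUENTIAL).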
try ( - IndexInput vectors = mergeState.segmentInfo.dir.openInput(tempRawVectorsFileName, IOContext.READONCE); - IndexInput docs = docsFileName == null ? null : mergeState.segmentInfo.dir.openInput(docsFileName, IOContext.READONCE) + IndexInput vectors = mergeState.segmentInfo.dir.openInput( + tempRawVectorsFileName, + IOContext.DEFAULT.withHints(DataAccessHint.SEQUENTIAL) + ); + IndexInput docs = docsFileName == null + ? null + : mergeState.segmentInfo.dir.openInput(docsFileName, IOContext.DEFAULT.withHints(DataAccessHint.SEQUENTIAL)) ) { final FloatVectorValues floatVectorValues = getFloatVectorValues(fieldInfo, docs, vectors, numVectors); diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java index 73d4a2f02aec1..9dde74bad43ed 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java @@ -20,6 +20,7 @@ package org.elasticsearch.index.codec.vectors.es816; import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.hnsw.FlatVectorsReader; import org.apache.lucene.codecs.lucene95.OrdToDocDISIReaderConfiguration; import org.apache.lucene.index.ByteVectorValues; @@ -31,12 +32,15 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataAccessHint; +import org.apache.lucene.store.FileDataHint; +import org.apache.lucene.store.FileTypeHint; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; @@ -44,13 +48,10 @@ import org.apache.lucene.util.hnsw.OrdinalTranslatedKnnCollector; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.elasticsearch.index.codec.vectors.BQVectorUtils; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapStats; import java.io.IOException; import java.util.HashMap; import java.util.Map; -import java.util.Objects; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readSimilarityFunction; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; @@ -60,7 +61,7 @@ * Copied from Lucene, replace with Lucene's implementation sometime after Lucene 10 */ @SuppressForbidden(reason = "Lucene classes") -public class ES816BinaryQuantizedVectorsReader extends FlatVectorsReader implements OffHeapStats { +public class ES816BinaryQuantizedVectorsReader extends FlatVectorsReader { private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(ES816BinaryQuantizedVectorsReader.class); @@ -109,7 +110,7 @@ public class ES816BinaryQuantizedVectorsReader extends FlatVectorsReader impleme ES816BinaryQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME, // Quantized vectors are accessed randomly from their node ID stored in 
the HNSW // graph. - state.context.withReadAdvice(ReadAdvice.RANDOM) + state.context.withHints(FileTypeHint.DATA, FileDataHint.KNN_VECTORS, DataAccessHint.RANDOM) ); success = true; } finally { @@ -226,17 +227,17 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { } @Override - public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public void search(String field, byte[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { rawVectorsReader.search(field, target, knnCollector, acceptDocs); } @Override - public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public void search(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { if (knnCollector.k() == 0) return; final RandomVectorScorer scorer = getRandomVectorScorer(field, target); if (scorer == null) return; OrdinalTranslatedKnnCollector collector = new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc); - Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs); + Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs.bits()); for (int i = 0; i < scorer.maxOrd(); i++) { if (acceptedOrds == null || acceptedOrds.get(i)) { collector.collect(i, scorer.score(i)); @@ -260,15 +261,14 @@ public long ramBytesUsed() { @Override public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) { - Objects.requireNonNull(fieldInfo); - var raw = OffHeapByteSizeUtils.getOffHeapByteSize(rawVectorsReader, fieldInfo); - var fieldEntry = fields.get(fieldInfo.name); - if (fieldEntry == null) { + var raw = rawVectorsReader.getOffHeapByteSize(fieldInfo); + FieldEntry fe = fields.get(fieldInfo.name); + if (fe == null) { assert fieldInfo.getVectorEncoding() == VectorEncoding.BYTE; return raw; } - var quant = Map.of(VECTOR_DATA_EXTENSION, fieldEntry.vectorDataLength()); - return OffHeapByteSizeUtils.mergeOffHeapByteSizeMaps(raw, quant); + var quant = Map.of(VECTOR_DATA_EXTENSION, fe.vectorDataLength()); + return KnnVectorsReader.mergeOffHeapByteSizeMaps(raw, quant); } public float[] getCentroid(String field) { diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOIndexInputSupplier.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOHint.java similarity index 59% rename from server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOIndexInputSupplier.java rename to server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOHint.java index 0640a5dacce65..73b7182911114 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOIndexInputSupplier.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOHint.java @@ -10,14 +10,7 @@ package org.elasticsearch.index.codec.vectors.es818; import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; -import java.io.IOException; - -/** - * A hook for {@link DirectIOLucene99FlatVectorsReader} to specify the input should be opened using DirectIO. - * Remove when IOContext allows more extensible payloads to be specified. 
- */ -public interface DirectIOIndexInputSupplier { - IndexInput openInputDirect(String name, IOContext context) throws IOException; +public enum DirectIOHint implements IOContext.FileOpenHint { + INSTANCE } diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsFormat.java index c68987902e995..620b9ec6637b6 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsFormat.java @@ -26,13 +26,16 @@ import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.store.FilterDirectory; +import org.apache.lucene.store.FlushInfo; import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.MergeInfo; +import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.index.codec.vectors.AbstractFlatVectorsFormat; import org.elasticsearch.index.codec.vectors.MergeReaderWrapper; import org.elasticsearch.index.store.FsDirectoryFactory; import java.io.IOException; +import java.util.Set; /** * Copied from Lucene99FlatVectorsFormat in Lucene 10.1 @@ -43,10 +46,6 @@ public class DirectIOLucene99FlatVectorsFormat extends AbstractFlatVectorsFormat { static final String NAME = "Lucene99FlatVectorsFormat"; - static final String META_CODEC_NAME = "Lucene99FlatVectorsFormatMeta"; - static final String VECTOR_DATA_CODEC_NAME = "Lucene99FlatVectorsFormatData"; - static final String META_EXTENSION = "vemf"; - static final String VECTOR_DATA_EXTENSION = "vec"; public static final int VERSION_START = 0; public static final int VERSION_CURRENT = VERSION_START; @@ -71,21 +70,63 @@ public FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOExceptio static boolean shouldUseDirectIO(SegmentReadState state) { assert USE_DIRECT_IO; - return FsDirectoryFactory.isHybridFs(state.directory) - && FilterDirectory.unwrap(state.directory) instanceof DirectIOIndexInputSupplier; + return FsDirectoryFactory.isHybridFs(state.directory); } @Override public FlatVectorsReader fieldsReader(SegmentReadState state) throws IOException { if (shouldUseDirectIO(state) && state.context.context() == IOContext.Context.DEFAULT) { + // only override the context for the random-access use case + SegmentReadState directIOState = new SegmentReadState( + state.directory, + state.segmentInfo, + state.fieldInfos, + new DirectIOContext(state.context.hints()), + state.segmentSuffix + ); // Use mmap for merges and direct I/O for searches. // TODO: Open the mmap file with sequential access instead of random (current behavior). 
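// The first reader (direct I/O) serves searches, while the second (mmap) is the one MergeReaderWrapper hands out from getMergeInstance() for merges.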
return new MergeReaderWrapper( - new DirectIOLucene99FlatVectorsReader(state, vectorsScorer), + new Lucene99FlatVectorsReader(directIOState, vectorsScorer), new Lucene99FlatVectorsReader(state, vectorsScorer) ); } else { return new Lucene99FlatVectorsReader(state, vectorsScorer); } } + + static class DirectIOContext implements IOContext { + + final Set<FileOpenHint> hints; + + DirectIOContext(Set<FileOpenHint> hints) { + // always add DirectIOHint to the hints given + this.hints = Sets.union(hints, Set.of(DirectIOHint.INSTANCE)); + } + + @Override + public Context context() { + return Context.DEFAULT; + } + + @Override + public MergeInfo mergeInfo() { + return null; + } + + @Override + public FlushInfo flushInfo() { + return null; + } + + @Override + public Set<FileOpenHint> hints() { + return hints; + } + + @Override + public IOContext withHints(FileOpenHint... hints) { + return new DirectIOContext(Set.of(hints)); + } + } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsReader.java deleted file mode 100644 index f304fe3aa4e88..0000000000000 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsReader.java +++ /dev/null @@ -1,350 +0,0 @@ -/* - * @notice - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Modifications copyright (C) 2025 Elasticsearch B.V. 
- */ -package org.elasticsearch.index.codec.vectors.es818; - -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.hnsw.FlatVectorsReader; -import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; -import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; -import org.apache.lucene.codecs.lucene95.OffHeapFloatVectorValues; -import org.apache.lucene.codecs.lucene95.OrdToDocDISIReaderConfiguration; -import org.apache.lucene.index.ByteVectorValues; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.FloatVectorValues; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.VectorEncoding; -import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.internal.hppc.IntObjectHashMap; -import org.apache.lucene.store.ChecksumIndexInput; -import org.apache.lucene.store.FilterDirectory; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.ReadAdvice; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.SuppressForbidden; -import org.apache.lucene.util.hnsw.RandomVectorScorer; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapStats; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Map; - -import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readSimilarityFunction; -import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; - -/** Copied from Lucene99FlatVectorsReader in Lucene 10.2, then modified to support DirectIOIndexInputSupplier */ -@SuppressForbidden(reason = "Copied from lucene") -public class DirectIOLucene99FlatVectorsReader extends FlatVectorsReader implements OffHeapStats { - - private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(DirectIOLucene99FlatVectorsReader.class); - - private final IntObjectHashMap<FieldEntry> fields = new IntObjectHashMap<>(); - private final IndexInput vectorData; - private final FieldInfos fieldInfos; - - @SuppressWarnings("this-escape") - public DirectIOLucene99FlatVectorsReader(SegmentReadState state, FlatVectorsScorer scorer) throws IOException { - super(scorer); - int versionMeta = readMetadata(state); - this.fieldInfos = state.fieldInfos; - boolean success = false; - try { - vectorData = openDataInput( - state, - versionMeta, - DirectIOLucene99FlatVectorsFormat.VECTOR_DATA_EXTENSION, - DirectIOLucene99FlatVectorsFormat.VECTOR_DATA_CODEC_NAME, - // Flat formats are used to randomly access vectors from their node ID that is stored - // in the HNSW graph. 
- state.context.withReadAdvice(ReadAdvice.RANDOM) - ); - success = true; - } finally { - if (success == false) { - IOUtils.closeWhileHandlingException(this); - } - } - } - - private int readMetadata(SegmentReadState state) throws IOException { - String metaFileName = IndexFileNames.segmentFileName( - state.segmentInfo.name, - state.segmentSuffix, - DirectIOLucene99FlatVectorsFormat.META_EXTENSION - ); - int versionMeta = -1; - try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) { - Throwable priorE = null; - try { - versionMeta = CodecUtil.checkIndexHeader( - meta, - DirectIOLucene99FlatVectorsFormat.META_CODEC_NAME, - DirectIOLucene99FlatVectorsFormat.VERSION_START, - DirectIOLucene99FlatVectorsFormat.VERSION_CURRENT, - state.segmentInfo.getId(), - state.segmentSuffix - ); - readFields(meta, state.fieldInfos); - } catch (Throwable exception) { - priorE = exception; - } finally { - CodecUtil.checkFooter(meta, priorE); - } - } - return versionMeta; - } - - private static IndexInput openDataInput( - SegmentReadState state, - int versionMeta, - String fileExtension, - String codecName, - IOContext context - ) throws IOException { - String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension); - // use direct IO for accessing raw vector data for searches - assert ES818BinaryQuantizedVectorsFormat.USE_DIRECT_IO; - IndexInput in = FilterDirectory.unwrap(state.directory) instanceof DirectIOIndexInputSupplier did - ? did.openInputDirect(fileName, context) - : state.directory.openInput(fileName, context); - boolean success = false; - try { - int versionVectorData = CodecUtil.checkIndexHeader( - in, - codecName, - DirectIOLucene99FlatVectorsFormat.VERSION_START, - DirectIOLucene99FlatVectorsFormat.VERSION_CURRENT, - state.segmentInfo.getId(), - state.segmentSuffix - ); - if (versionMeta != versionVectorData) { - throw new CorruptIndexException( - "Format versions mismatch: meta=" + versionMeta + ", " + codecName + "=" + versionVectorData, - in - ); - } - CodecUtil.retrieveChecksum(in); - success = true; - return in; - } finally { - if (success == false) { - IOUtils.closeWhileHandlingException(in); - } - } - } - - private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOException { - for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { - FieldInfo info = infos.fieldInfo(fieldNumber); - if (info == null) { - throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); - } - FieldEntry fieldEntry = FieldEntry.create(meta, info); - fields.put(info.number, fieldEntry); - } - } - - @Override - public long ramBytesUsed() { - return SHALLOW_SIZE + fields.ramBytesUsed(); - } - - @Override - public void checkIntegrity() throws IOException { - CodecUtil.checksumEntireFile(vectorData); - } - - @Override - public FlatVectorsReader getMergeInstance() { - try { - // Update the read advice since vectors are guaranteed to be accessed sequentially for merge - this.vectorData.updateReadAdvice(ReadAdvice.SEQUENTIAL); - return this; - } catch (IOException exception) { - throw new UncheckedIOException(exception); - } - } - - private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) { - final FieldInfo info = fieldInfos.fieldInfo(field); - final FieldEntry fieldEntry; - if (info == null || (fieldEntry = fields.get(info.number)) == null) { - throw new IllegalArgumentException("field=\"" + field + "\" not found"); - } - if (fieldEntry.vectorEncoding != 
expectedEncoding) { - throw new IllegalArgumentException( - "field=\"" + field + "\" is encoded as: " + fieldEntry.vectorEncoding + " expected: " + expectedEncoding - ); - } - return fieldEntry; - } - - @Override - public FloatVectorValues getFloatVectorValues(String field) throws IOException { - final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32); - return OffHeapFloatVectorValues.load( - fieldEntry.similarityFunction, - vectorScorer, - fieldEntry.ordToDoc, - fieldEntry.vectorEncoding, - fieldEntry.dimension, - fieldEntry.vectorDataOffset, - fieldEntry.vectorDataLength, - vectorData - ); - } - - @Override - public ByteVectorValues getByteVectorValues(String field) throws IOException { - final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE); - return OffHeapByteVectorValues.load( - fieldEntry.similarityFunction, - vectorScorer, - fieldEntry.ordToDoc, - fieldEntry.vectorEncoding, - fieldEntry.dimension, - fieldEntry.vectorDataOffset, - fieldEntry.vectorDataLength, - vectorData - ); - } - - @Override - public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException { - final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32); - return vectorScorer.getRandomVectorScorer( - fieldEntry.similarityFunction, - OffHeapFloatVectorValues.load( - fieldEntry.similarityFunction, - vectorScorer, - fieldEntry.ordToDoc, - fieldEntry.vectorEncoding, - fieldEntry.dimension, - fieldEntry.vectorDataOffset, - fieldEntry.vectorDataLength, - vectorData - ), - target - ); - } - - @Override - public RandomVectorScorer getRandomVectorScorer(String field, byte[] target) throws IOException { - final FieldEntry fieldEntry = getFieldEntry(field, VectorEncoding.BYTE); - return vectorScorer.getRandomVectorScorer( - fieldEntry.similarityFunction, - OffHeapByteVectorValues.load( - fieldEntry.similarityFunction, - vectorScorer, - fieldEntry.ordToDoc, - fieldEntry.vectorEncoding, - fieldEntry.dimension, - fieldEntry.vectorDataOffset, - fieldEntry.vectorDataLength, - vectorData - ), - target - ); - } - - @Override - public void finishMerge() throws IOException { - // This makes sure that the access pattern hint is reverted back since HNSW implementation - // needs it - this.vectorData.updateReadAdvice(ReadAdvice.RANDOM); - } - - @Override - public void close() throws IOException { - IOUtils.close(vectorData); - } - - @Override - public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) { - return Map.of(); // no off-heap - } - - private record FieldEntry( - VectorSimilarityFunction similarityFunction, - VectorEncoding vectorEncoding, - long vectorDataOffset, - long vectorDataLength, - int dimension, - int size, - OrdToDocDISIReaderConfiguration ordToDoc, - FieldInfo info - ) { - - FieldEntry { - if (similarityFunction != info.getVectorSimilarityFunction()) { - throw new IllegalStateException( - "Inconsistent vector similarity function for field=\"" - + info.name - + "\"; " - + similarityFunction - + " != " - + info.getVectorSimilarityFunction() - ); - } - int infoVectorDimension = info.getVectorDimension(); - if (infoVectorDimension != dimension) { - throw new IllegalStateException( - "Inconsistent vector dimension for field=\"" + info.name + "\"; " + infoVectorDimension + " != " + dimension - ); - } - - int byteSize = switch (info.getVectorEncoding()) { - case BYTE -> Byte.BYTES; - case FLOAT32 -> Float.BYTES; - }; - long vectorBytes = Math.multiplyExact((long) infoVectorDimension, byteSize); - long numBytes = Math.multiplyExact(vectorBytes, 
size); - if (numBytes != vectorDataLength) { - throw new IllegalStateException( - "Vector data length " - + vectorDataLength - + " not matching size=" - + size - + " * dim=" - + dimension - + " * byteSize=" - + byteSize - + " = " - + numBytes - ); - } - } - - static FieldEntry create(IndexInput input, FieldInfo info) throws IOException { - final VectorEncoding vectorEncoding = readVectorEncoding(input); - final VectorSimilarityFunction similarityFunction = readSimilarityFunction(input); - final var vectorDataOffset = input.readVLong(); - final var vectorDataLength = input.readVLong(); - final var dimension = input.readVInt(); - final var size = input.readInt(); - final var ordToDoc = OrdToDocDISIReaderConfiguration.fromStoredMeta(input, size); - return new FieldEntry(similarityFunction, vectorEncoding, vectorDataOffset, vectorDataLength, dimension, size, ordToDoc, info); - } - } -} diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java index 6de775c4773b5..1082faaca4256 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java @@ -20,6 +20,7 @@ package org.elasticsearch.index.codec.vectors.es818; import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.hnsw.FlatVectorsReader; import org.apache.lucene.codecs.lucene95.OrdToDocDISIReaderConfiguration; import org.apache.lucene.index.ByteVectorValues; @@ -31,12 +32,15 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataAccessHint; +import org.apache.lucene.store.FileDataHint; +import org.apache.lucene.store.FileTypeHint; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; @@ -45,13 +49,10 @@ import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.elasticsearch.index.codec.vectors.BQVectorUtils; import org.elasticsearch.index.codec.vectors.OptimizedScalarQuantizer; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapStats; import java.io.IOException; import java.util.HashMap; import java.util.Map; -import java.util.Objects; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readSimilarityFunction; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; @@ -61,7 +62,7 @@ * Copied from Lucene, replace with Lucene's implementation sometime after Lucene 10 */ @SuppressForbidden(reason = "Lucene classes") -public class ES818BinaryQuantizedVectorsReader extends FlatVectorsReader implements OffHeapStats { +public class ES818BinaryQuantizedVectorsReader extends FlatVectorsReader { private static final long SHALLOW_SIZE = 
RamUsageEstimator.shallowSizeOfInstance(ES818BinaryQuantizedVectorsReader.class); @@ -111,7 +112,7 @@ public class ES818BinaryQuantizedVectorsReader extends FlatVectorsReader impleme ES818BinaryQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME, // Quantized vectors are accessed randomly from their node ID stored in the HNSW // graph. - state.context.withReadAdvice(ReadAdvice.RANDOM) + state.context.withHints(FileTypeHint.DATA, FileDataHint.KNN_VECTORS, DataAccessHint.RANDOM) ); success = true; } finally { @@ -129,13 +130,8 @@ private ES818BinaryQuantizedVectorsReader(ES818BinaryQuantizedVectorsReader clon this.fields = clone.fields; } - // For testing - FlatVectorsReader getRawVectorsReader() { - return rawVectorsReader; - } - @Override - public FlatVectorsReader getMergeInstance() { + public FlatVectorsReader getMergeInstance() throws IOException { return new ES818BinaryQuantizedVectorsReader(this, rawVectorsReader.getMergeInstance()); } @@ -245,17 +241,17 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { } @Override - public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public void search(String field, byte[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { rawVectorsReader.search(field, target, knnCollector, acceptDocs); } @Override - public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public void search(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException { if (knnCollector.k() == 0) return; final RandomVectorScorer scorer = getRandomVectorScorer(field, target); if (scorer == null) return; OrdinalTranslatedKnnCollector collector = new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc); - Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs); + Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs.bits()); for (int i = 0; i < scorer.maxOrd(); i++) { if (acceptedOrds == null || acceptedOrds.get(i)) { collector.collect(i, scorer.score(i)); @@ -279,15 +275,14 @@ public long ramBytesUsed() { @Override public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) { - Objects.requireNonNull(fieldInfo); - var raw = OffHeapByteSizeUtils.getOffHeapByteSize(rawVectorsReader, fieldInfo); - var fieldEntry = fields.get(fieldInfo.name); - if (fieldEntry == null) { + var raw = rawVectorsReader.getOffHeapByteSize(fieldInfo); + FieldEntry fe = fields.get(fieldInfo.name); + if (fe == null) { assert fieldInfo.getVectorEncoding() == VectorEncoding.BYTE; return raw; } - var quant = Map.of(VECTOR_DATA_EXTENSION, fieldEntry.vectorDataLength()); - return OffHeapByteSizeUtils.mergeOffHeapByteSizeMaps(raw, quant); + var quant = Map.of(VECTOR_DATA_EXTENSION, fe.vectorDataLength()); + return KnnVectorsReader.mergeOffHeapByteSizeMaps(raw, quant); } public float[] getCentroid(String field) { diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/AssertingKnnVectorsReaderReflect.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/AssertingKnnVectorsReaderReflect.java deleted file mode 100644 index b22fa88fb49b4..0000000000000 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/AssertingKnnVectorsReaderReflect.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements.
Licensed under the "Elastic License - * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side - * Public License v 1"; you may not use this file except in compliance with, at - * your election, the "Elastic License 2.0", the "GNU Affero General Public - * License v3.0 only", or the "Server Side Public License, v 1". - */ - -package org.elasticsearch.index.codec.vectors.reflect; - -import org.apache.lucene.codecs.KnnVectorsReader; -import org.elasticsearch.core.SuppressForbidden; - -import java.lang.invoke.MethodHandle; -import java.lang.invoke.MethodHandles; - -/** - * Reflective access to unwrap non-accessible delegate in AssertingKnnVectorsReader. - * Remove once KnnVectorsReaders::getOffHeapByteSize is available. - */ -public class AssertingKnnVectorsReaderReflect { - - @SuppressForbidden(reason = "static type is not accessible") - public static KnnVectorsReader unwrapAssertingReader(KnnVectorsReader reader) { - try { - if (ASSERTING_ASSERT_KNN_READER_CLS != null && ASSERTING_ASSERT_KNN_READER_CLS.isAssignableFrom(reader.getClass())) { - return (KnnVectorsReader) GET_VECTOR_INDEX_LENGTH_HANDLE.invoke(reader); - } - } catch (Throwable t) { - handleThrowable(t); - } - return reader; - } - - private static final Class ASSERTING_ASSERT_KNN_READER_CLS = getAssertingReaderOrNull(); - private static final MethodHandle GET_VECTOR_INDEX_LENGTH_HANDLE = getDelegateFieldHandle(); - - private static Class getAssertingReaderOrNull() { - try { - return Class.forName("org.apache.lucene.tests.codecs.asserting.AssertingKnnVectorsFormat$AssertingKnnVectorsReader"); - } catch (ClassNotFoundException e) { - return null; - } - } - - private static MethodHandle getDelegateFieldHandle() { - try { - var cls = getAssertingReaderOrNull(); - if (cls == null) { - return MethodHandles.throwException(KnnVectorsReader.class, AssertionError.class); - } - var lookup = MethodHandles.privateLookupIn(cls, MethodHandles.lookup()); - return lookup.findGetter(cls, "delegate", KnnVectorsReader.class); - } catch (ReflectiveOperationException e) { - throw new AssertionError(e); - } - } - - static void handleThrowable(Throwable t) { - if (t instanceof Error error) { - throw error; - } else if (t instanceof RuntimeException runtimeException) { - throw runtimeException; - } else { - throw new AssertionError(t); - } - } -} diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/OffHeapByteSizeUtils.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/OffHeapByteSizeUtils.java deleted file mode 100644 index 81cea18a3d560..0000000000000 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/OffHeapByteSizeUtils.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the "Elastic License - * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side - * Public License v 1"; you may not use this file except in compliance with, at - * your election, the "Elastic License 2.0", the "GNU Affero General Public - * License v3.0 only", or the "Server Side Public License, v 1". 
- */ - -package org.elasticsearch.index.codec.vectors.reflect; - -import org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsReader; -import org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsReader; -import org.apache.lucene.backward_codecs.lucene92.Lucene92HnswVectorsReader; -import org.apache.lucene.backward_codecs.lucene94.Lucene94HnswVectorsReader; -import org.apache.lucene.backward_codecs.lucene95.Lucene95HnswVectorsReader; -import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsReader; -import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader; -import org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsReader; -import org.apache.lucene.index.FieldInfo; - -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -/** - * Static utility methods to help retrieve desired off-heap vector index size. - * Remove once KnnVectorsReaders::getOffHeapByteSize is available. - */ -public class OffHeapByteSizeUtils { - - private OffHeapByteSizeUtils() {} // no instances - - public static Map<String, Long> getOffHeapByteSize(KnnVectorsReader reader, FieldInfo fieldInfo) { - reader = AssertingKnnVectorsReaderReflect.unwrapAssertingReader(reader); - switch (reader) { - case OffHeapStats offHeapStats -> { - return offHeapStats.getOffHeapByteSize(fieldInfo); - } - case Lucene99HnswVectorsReader hnswVectorsReader -> { - var graph = OffHeapReflectionUtils.getOffHeapByteSizeL99HNSW(hnswVectorsReader, fieldInfo); - var flat = getOffHeapByteSize(OffHeapReflectionUtils.getFlatVectorsReaderL99HNSW(hnswVectorsReader), fieldInfo); - return mergeOffHeapByteSizeMaps(graph, flat); - } - case Lucene99ScalarQuantizedVectorsReader scalarQuantizedVectorsReader -> { - var quant = OffHeapReflectionUtils.getOffHeapByteSizeSQ(scalarQuantizedVectorsReader, fieldInfo); - var raw = getOffHeapByteSize(OffHeapReflectionUtils.getFlatVectorsReaderSQ(scalarQuantizedVectorsReader), fieldInfo); - return mergeOffHeapByteSizeMaps(quant, raw); - } - case Lucene99FlatVectorsReader flatVectorsReader -> { - return OffHeapReflectionUtils.getOffHeapByteSizeF99FLT(flatVectorsReader, fieldInfo); - } - case Lucene95HnswVectorsReader lucene95HnswVectorsReader -> { - return OffHeapReflectionUtils.getOffHeapByteSizeL95HNSW(lucene95HnswVectorsReader, fieldInfo); - } - case Lucene94HnswVectorsReader lucene94HnswVectorsReader -> { - return OffHeapReflectionUtils.getOffHeapByteSizeL94HNSW(lucene94HnswVectorsReader, fieldInfo); - } - case Lucene92HnswVectorsReader lucene92HnswVectorsReader -> { - return OffHeapReflectionUtils.getOffHeapByteSizeL92HNSW(lucene92HnswVectorsReader, fieldInfo); - } - case Lucene91HnswVectorsReader lucene91HnswVectorsReader -> { - return OffHeapReflectionUtils.getOffHeapByteSizeL91HNSW(lucene91HnswVectorsReader, fieldInfo); - } - case Lucene90HnswVectorsReader lucene90HnswVectorsReader -> { - return OffHeapReflectionUtils.getOffHeapByteSizeL90HNSW(lucene90HnswVectorsReader, fieldInfo); - } - case null, default -> { - assert false : "unexpected reader:" + reader; - } - } - return Map.of(); - } - - /** - * Merges the Maps returned by getOffHeapByteSize(FieldInfo). - * - * <p>This method is a convenience for aggregating the desired off-heap memory requirements for - * several fields. The keys in the returned map are a union of the keys in the given maps. Entries - * with the same key are summed.
- */ - public static Map<String, Long> mergeOffHeapByteSizeMaps(Map<String, Long> map1, Map<String, Long> map2) { - return Stream.of(map1, map2) - .flatMap(map -> map.entrySet().stream()) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, Long::sum)); - } -} diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/OffHeapReflectionUtils.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/OffHeapReflectionUtils.java deleted file mode 100644 index 69ca09bddab42..0000000000000 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/reflect/OffHeapReflectionUtils.java +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the "Elastic License - * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side - * Public License v 1"; you may not use this file except in compliance with, at - * your election, the "Elastic License 2.0", the "GNU Affero General Public - * License v3.0 only", or the "Server Side Public License, v 1". - */ - -package org.elasticsearch.index.codec.vectors.reflect; - -import org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsReader; -import org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsReader; -import org.apache.lucene.backward_codecs.lucene92.Lucene92HnswVectorsReader; -import org.apache.lucene.backward_codecs.lucene94.Lucene94HnswVectorsReader; -import org.apache.lucene.backward_codecs.lucene95.Lucene95HnswVectorsReader; -import org.apache.lucene.codecs.hnsw.FlatVectorsReader; -import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsReader; -import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader; -import org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsReader; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.VectorEncoding; -import org.elasticsearch.core.SuppressForbidden; - -import java.lang.invoke.MethodHandle; -import java.lang.invoke.MethodHandles; -import java.lang.invoke.VarHandle; -import java.util.Map; - -import static java.lang.invoke.MethodType.methodType; - -/** - * Reflective access to non-accessible members of Lucene's KnnVectorsReader implementations. - * Remove once KnnVectorsReaders::getOffHeapByteSize is available.
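[Editor's note: the helper being deleted above is superseded by KnnVectorsReader.mergeOffHeapByteSizeMaps, which this patch now calls directly (see the ES818BinaryQuantizedVectorsReader and DenseVectorStats hunks). A minimal, self-contained sketch of the merge semantics the deleted javadoc describes, with union of keys and values summed on collision; the extensions and byte counts are illustrative only:]

import org.apache.lucene.codecs.KnnVectorsReader;
import java.util.Map;

class OffHeapMergeSketch {
    static Map<String, Long> example() {
        Map<String, Long> graph = Map.of("vex", 1_024L);           // HNSW graph bytes (illustrative)
        Map<String, Long> flat = Map.of("vec", 4_096L, "vex", 1L); // raw vector bytes (illustrative)
        // Union of keys; entries sharing a key are summed -> {vec=4096, vex=1025}
        return KnnVectorsReader.mergeOffHeapByteSizeMaps(graph, flat);
    }
}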
- */ -public class OffHeapReflectionUtils { - - private OffHeapReflectionUtils() {} - - static final String FLAT_VECTOR_DATA_EXTENSION = "vec"; - static final String SQ_VECTOR_INDEX_EXTENSION = "veq"; - static final String HNSW_VECTOR_INDEX_EXTENSION = "vex"; - - private static final MethodHandle GET_FIELD_ENTRY_HNDL_SQ; - private static final MethodHandle GET_VECTOR_DATA_LENGTH_HANDLE_SQ; - private static final VarHandle RAW_VECTORS_READER_HNDL_SQ; - private static final MethodHandle GET_FIELD_ENTRY_HANDLE_L99FLT; - private static final MethodHandle VECTOR_DATA_LENGTH_HANDLE_L99FLT; - private static final MethodHandle GET_FIELD_ENTRY_HANDLE_L99HNSW; - private static final MethodHandle GET_VECTOR_INDEX_LENGTH_HANDLE_L99HNSW; - private static final VarHandle FLAT_VECTORS_READER_HNDL_L99HNSW; - - static final Class<?> L99_SQ_VR_CLS = Lucene99ScalarQuantizedVectorsReader.class; - static final Class<?> L99_FLT_VR_CLS = Lucene99FlatVectorsReader.class; - static final Class<?> L99_HNSW_VR_CLS = Lucene99HnswVectorsReader.class; - - // old codecs - private static final MethodHandle GET_FIELD_ENTRY_HANDLE_L90HNSW; - private static final MethodHandle GET_VECTOR_INDEX_LENGTH_HANDLE_L90HNSW; - private static final MethodHandle GET_VECTOR_DATA_LENGTH_HANDLE_L90HNSW; - - private static final MethodHandle GET_FIELD_ENTRY_HANDLE_L91HNSW; - private static final MethodHandle GET_VECTOR_INDEX_LENGTH_HANDLE_L91HNSW; - private static final MethodHandle GET_VECTOR_DATA_LENGTH_HANDLE_L91HNSW; - - private static final MethodHandle GET_FIELD_ENTRY_HANDLE_L92HNSW; - private static final MethodHandle GET_VECTOR_INDEX_LENGTH_HANDLE_L92HNSW; - private static final MethodHandle GET_VECTOR_DATA_LENGTH_HANDLE_L92HNSW; - - private static final MethodHandle GET_FIELD_ENTRY_HANDLE_L94HNSW; - private static final MethodHandle GET_VECTOR_INDEX_LENGTH_HANDLE_L94HNSW; - private static final MethodHandle GET_VECTOR_DATA_LENGTH_HANDLE_L94HNSW; - - private static final MethodHandle GET_FIELD_ENTRY_HANDLE_L95HNSW; - private static final MethodHandle GET_VECTOR_INDEX_LENGTH_HANDLE_L95HNSW; - private static final MethodHandle GET_VECTOR_DATA_LENGTH_HANDLE_L95HNSW; - - static final Class<?> L90_HNSW_VR_CLS = Lucene90HnswVectorsReader.class; - static final Class<?> L91_HNSW_VR_CLS = Lucene91HnswVectorsReader.class; - static final Class<?> L92_HNSW_VR_CLS = Lucene92HnswVectorsReader.class; - static final Class<?> L94_HNSW_VR_CLS = Lucene94HnswVectorsReader.class; - static final Class<?> L95_HNSW_VR_CLS = Lucene95HnswVectorsReader.class; - - static { - try { - // Lucene99ScalarQuantizedVectorsReader - var cls = Class.forName("org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsReader$FieldEntry"); - var lookup = MethodHandles.privateLookupIn(L99_SQ_VR_CLS, MethodHandles.lookup()); - var mt = methodType(cls, String.class); - GET_FIELD_ENTRY_HNDL_SQ = lookup.findVirtual(L99_SQ_VR_CLS, "getFieldEntry", mt); - GET_VECTOR_DATA_LENGTH_HANDLE_SQ = lookup.findVirtual(cls, "vectorDataLength", methodType(long.class)); - RAW_VECTORS_READER_HNDL_SQ = lookup.findVarHandle(L99_SQ_VR_CLS, "rawVectorsReader", FlatVectorsReader.class); - // Lucene99FlatVectorsReader - cls = Class.forName("org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsReader$FieldEntry"); - lookup = MethodHandles.privateLookupIn(L99_FLT_VR_CLS, MethodHandles.lookup()); - mt = methodType(cls, String.class, VectorEncoding.class); - GET_FIELD_ENTRY_HANDLE_L99FLT = lookup.findVirtual(L99_FLT_VR_CLS, "getFieldEntry", mt); - VECTOR_DATA_LENGTH_HANDLE_L99FLT = lookup.findVirtual(cls,
"vectorDataLength", methodType(long.class)); - // Lucene99HnswVectorsReader - cls = Class.forName("org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader$FieldEntry"); - lookup = MethodHandles.privateLookupIn(L99_HNSW_VR_CLS, MethodHandles.lookup()); - mt = methodType(cls, String.class, VectorEncoding.class); - GET_FIELD_ENTRY_HANDLE_L99HNSW = lookup.findVirtual(L99_HNSW_VR_CLS, "getFieldEntry", mt); - GET_VECTOR_INDEX_LENGTH_HANDLE_L99HNSW = lookup.findVirtual(cls, "vectorIndexLength", methodType(long.class)); - lookup = MethodHandles.privateLookupIn(L99_HNSW_VR_CLS, MethodHandles.lookup()); - FLAT_VECTORS_READER_HNDL_L99HNSW = lookup.findVarHandle(L99_HNSW_VR_CLS, "flatVectorsReader", FlatVectorsReader.class); - // Lucene90HnswVectorsReader - cls = Class.forName("org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsReader$FieldEntry"); - lookup = MethodHandles.privateLookupIn(L90_HNSW_VR_CLS, MethodHandles.lookup()); - mt = methodType(cls, String.class); - GET_FIELD_ENTRY_HANDLE_L90HNSW = lookup.findVirtual(L90_HNSW_VR_CLS, "getFieldEntry", mt); - GET_VECTOR_INDEX_LENGTH_HANDLE_L90HNSW = lookup.findVirtual(cls, "indexDataLength", methodType(long.class)); - GET_VECTOR_DATA_LENGTH_HANDLE_L90HNSW = lookup.findVirtual(cls, "vectorDataLength", methodType(long.class)); - // Lucene91HnswVectorsReader - cls = Class.forName("org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsReader$FieldEntry"); - lookup = MethodHandles.privateLookupIn(L91_HNSW_VR_CLS, MethodHandles.lookup()); - mt = methodType(cls, String.class); - GET_FIELD_ENTRY_HANDLE_L91HNSW = lookup.findVirtual(L91_HNSW_VR_CLS, "getFieldEntry", mt); - GET_VECTOR_INDEX_LENGTH_HANDLE_L91HNSW = lookup.findVirtual(cls, "vectorIndexLength", methodType(long.class)); - GET_VECTOR_DATA_LENGTH_HANDLE_L91HNSW = lookup.findVirtual(cls, "vectorDataLength", methodType(long.class)); - // Lucene92HnswVectorsReader - cls = Class.forName("org.apache.lucene.backward_codecs.lucene92.Lucene92HnswVectorsReader$FieldEntry"); - lookup = MethodHandles.privateLookupIn(L92_HNSW_VR_CLS, MethodHandles.lookup()); - mt = methodType(cls, String.class); - GET_FIELD_ENTRY_HANDLE_L92HNSW = lookup.findVirtual(L92_HNSW_VR_CLS, "getFieldEntry", mt); - GET_VECTOR_INDEX_LENGTH_HANDLE_L92HNSW = lookup.findVirtual(cls, "vectorIndexLength", methodType(long.class)); - GET_VECTOR_DATA_LENGTH_HANDLE_L92HNSW = lookup.findVirtual(cls, "vectorDataLength", methodType(long.class)); - // Lucene94HnswVectorsReader - cls = Class.forName("org.apache.lucene.backward_codecs.lucene94.Lucene94HnswVectorsReader$FieldEntry"); - lookup = MethodHandles.privateLookupIn(L94_HNSW_VR_CLS, MethodHandles.lookup()); - mt = methodType(cls, String.class, VectorEncoding.class); - GET_FIELD_ENTRY_HANDLE_L94HNSW = lookup.findVirtual(L94_HNSW_VR_CLS, "getFieldEntry", mt); - GET_VECTOR_INDEX_LENGTH_HANDLE_L94HNSW = lookup.findVirtual(cls, "vectorIndexLength", methodType(long.class)); - GET_VECTOR_DATA_LENGTH_HANDLE_L94HNSW = lookup.findVirtual(cls, "vectorDataLength", methodType(long.class)); - // Lucene95HnswVectorsReader - cls = Class.forName("org.apache.lucene.backward_codecs.lucene95.Lucene95HnswVectorsReader$FieldEntry"); - lookup = MethodHandles.privateLookupIn(L95_HNSW_VR_CLS, MethodHandles.lookup()); - mt = methodType(cls, String.class, VectorEncoding.class); - GET_FIELD_ENTRY_HANDLE_L95HNSW = lookup.findVirtual(L95_HNSW_VR_CLS, "getFieldEntry", mt); - GET_VECTOR_INDEX_LENGTH_HANDLE_L95HNSW = lookup.findVirtual(cls, "vectorIndexLength", methodType(long.class)); - 
GET_VECTOR_DATA_LENGTH_HANDLE_L95HNSW = lookup.findVirtual(cls, "vectorDataLength", methodType(long.class)); - } catch (ReflectiveOperationException e) { - throw new AssertionError(e); - } - } - - @SuppressForbidden(reason = "static type is not accessible") - static Map<String, Long> getOffHeapByteSizeSQ(Lucene99ScalarQuantizedVectorsReader reader, FieldInfo fieldInfo) { - try { - var entry = GET_FIELD_ENTRY_HNDL_SQ.invoke(reader, fieldInfo.name); - long len = (long) GET_VECTOR_DATA_LENGTH_HANDLE_SQ.invoke(entry); - return Map.of(SQ_VECTOR_INDEX_EXTENSION, len); - } catch (Throwable t) { - handleThrowable(t); - } - throw new AssertionError("should not reach here"); - } - - static FlatVectorsReader getFlatVectorsReaderSQ(Lucene99ScalarQuantizedVectorsReader reader) { - return (FlatVectorsReader) RAW_VECTORS_READER_HNDL_SQ.get(reader); - } - - @SuppressForbidden(reason = "static type is not accessible") - static Map<String, Long> getOffHeapByteSizeF99FLT(Lucene99FlatVectorsReader reader, FieldInfo fieldInfo) { - try { - var entry = GET_FIELD_ENTRY_HANDLE_L99FLT.invoke(reader, fieldInfo.name, fieldInfo.getVectorEncoding()); - long len = (long) VECTOR_DATA_LENGTH_HANDLE_L99FLT.invoke(entry); - return Map.of(FLAT_VECTOR_DATA_EXTENSION, len); - } catch (Throwable t) { - handleThrowable(t); - } - throw new AssertionError("should not reach here"); - } - - @SuppressForbidden(reason = "static type is not accessible") - static Map<String, Long> getOffHeapByteSizeL99HNSW(Lucene99HnswVectorsReader reader, FieldInfo fieldInfo) { - try { - var entry = GET_FIELD_ENTRY_HANDLE_L99HNSW.invoke(reader, fieldInfo.name, fieldInfo.getVectorEncoding()); - long len = (long) GET_VECTOR_INDEX_LENGTH_HANDLE_L99HNSW.invoke(entry); - return Map.of(HNSW_VECTOR_INDEX_EXTENSION, len); - } catch (Throwable t) { - handleThrowable(t); - } - throw new AssertionError("should not reach here"); - } - - static FlatVectorsReader getFlatVectorsReaderL99HNSW(Lucene99HnswVectorsReader reader) { - return (FlatVectorsReader) FLAT_VECTORS_READER_HNDL_L99HNSW.get(reader); - } - - // old codecs - @SuppressForbidden(reason = "static type is not accessible") - static Map<String, Long> getOffHeapByteSizeL90HNSW(Lucene90HnswVectorsReader reader, FieldInfo fieldInfo) { - try { - var entry = GET_FIELD_ENTRY_HANDLE_L90HNSW.invoke(reader, fieldInfo.name); - long graph = (long) GET_VECTOR_INDEX_LENGTH_HANDLE_L90HNSW.invoke(entry); - long raw = (long) GET_VECTOR_DATA_LENGTH_HANDLE_L90HNSW.invoke(entry); - return Map.of(HNSW_VECTOR_INDEX_EXTENSION, graph, FLAT_VECTOR_DATA_EXTENSION, raw); - } catch (Throwable t) { - handleThrowable(t); - } - throw new AssertionError("should not reach here"); - } - - @SuppressForbidden(reason = "static type is not accessible") - static Map<String, Long> getOffHeapByteSizeL91HNSW(Lucene91HnswVectorsReader reader, FieldInfo fieldInfo) { - try { - var entry = GET_FIELD_ENTRY_HANDLE_L91HNSW.invoke(reader, fieldInfo.name); - long graph = (long) GET_VECTOR_INDEX_LENGTH_HANDLE_L91HNSW.invoke(entry); - long raw = (long) GET_VECTOR_DATA_LENGTH_HANDLE_L91HNSW.invoke(entry); - return Map.of(HNSW_VECTOR_INDEX_EXTENSION, graph, FLAT_VECTOR_DATA_EXTENSION, raw); - } catch (Throwable t) { - handleThrowable(t); - } - throw new AssertionError("should not reach here"); - } - - @SuppressForbidden(reason = "static type is not accessible") - static Map<String, Long> getOffHeapByteSizeL92HNSW(Lucene92HnswVectorsReader reader, FieldInfo fieldInfo) { - try { - var entry = GET_FIELD_ENTRY_HANDLE_L92HNSW.invoke(reader, fieldInfo.name); - long graph = (long) GET_VECTOR_INDEX_LENGTH_HANDLE_L92HNSW.invoke(entry); - long raw =
(long) GET_VECTOR_DATA_LENGTH_HANDLE_L92HNSW.invoke(entry); - return Map.of(HNSW_VECTOR_INDEX_EXTENSION, graph, FLAT_VECTOR_DATA_EXTENSION, raw); - } catch (Throwable t) { - handleThrowable(t); - } - throw new AssertionError("should not reach here"); - } - - @SuppressForbidden(reason = "static type is not accessible") - static Map<String, Long> getOffHeapByteSizeL94HNSW(Lucene94HnswVectorsReader reader, FieldInfo fieldInfo) { - try { - var entry = GET_FIELD_ENTRY_HANDLE_L94HNSW.invoke(reader, fieldInfo.name, fieldInfo.getVectorEncoding()); - long graph = (long) GET_VECTOR_INDEX_LENGTH_HANDLE_L94HNSW.invoke(entry); - long raw = (long) GET_VECTOR_DATA_LENGTH_HANDLE_L94HNSW.invoke(entry); - return Map.of(HNSW_VECTOR_INDEX_EXTENSION, graph, FLAT_VECTOR_DATA_EXTENSION, raw); - } catch (Throwable t) { - handleThrowable(t); - } - throw new AssertionError("should not reach here"); - } - - @SuppressForbidden(reason = "static type is not accessible") - static Map<String, Long> getOffHeapByteSizeL95HNSW(Lucene95HnswVectorsReader reader, FieldInfo fieldInfo) { - try { - var entry = GET_FIELD_ENTRY_HANDLE_L95HNSW.invoke(reader, fieldInfo.name, fieldInfo.getVectorEncoding()); - long graph = (long) GET_VECTOR_INDEX_LENGTH_HANDLE_L95HNSW.invoke(entry); - long raw = (long) GET_VECTOR_DATA_LENGTH_HANDLE_L95HNSW.invoke(entry); - return Map.of(HNSW_VECTOR_INDEX_EXTENSION, graph, FLAT_VECTOR_DATA_EXTENSION, raw); - } catch (Throwable t) { - handleThrowable(t); - } - throw new AssertionError("should not reach here"); - } - - private static void handleThrowable(Throwable t) { - if (t instanceof Error error) { - throw error; - } else if (t instanceof RuntimeException runtimeException) { - throw runtimeException; - } else { - throw new AssertionError(t); - } - } -} diff --git a/server/src/main/java/org/elasticsearch/index/engine/Engine.java b/server/src/main/java/org/elasticsearch/index/engine/Engine.java index ed6a462365222..5a1c49b54b7ac 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/Engine.java +++ b/server/src/main/java/org/elasticsearch/index/engine/Engine.java @@ -68,7 +68,6 @@ import org.elasticsearch.index.VersionType; import org.elasticsearch.index.codec.FieldInfosWithUsages; import org.elasticsearch.index.codec.TrackingPostingsInMemoryBytesCodec; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import org.elasticsearch.index.mapper.DocumentParser; import org.elasticsearch.index.mapper.LuceneDocument; import org.elasticsearch.index.mapper.Mapper; @@ -398,7 +397,7 @@ private DenseVectorStats getDenseVectorStats(final LeafReader atomicReader, List if (vectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader fieldsReader) { vectorsReader = fieldsReader.getFieldReader(info.name); } - Map<String, Long> offHeap = OffHeapByteSizeUtils.getOffHeapByteSize(vectorsReader, info); + Map<String, Long> offHeap = vectorsReader.getOffHeapByteSize(info); offHeapStats.put(info.name, offHeap); } } diff --git a/server/src/main/java/org/elasticsearch/index/engine/TranslogDirectoryReader.java b/server/src/main/java/org/elasticsearch/index/engine/TranslogDirectoryReader.java index 598fb076ba222..24b0512d27598 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/TranslogDirectoryReader.java +++ b/server/src/main/java/org/elasticsearch/index/engine/TranslogDirectoryReader.java @@ -40,6 +40,7 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.AcceptDocs; import
org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.store.ByteBuffersDirectory; @@ -447,12 +448,12 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { } @Override - public void searchNearestVectors(String field, float[] target, KnnCollector collector, Bits acceptDocs) throws IOException { + public void searchNearestVectors(String field, float[] target, KnnCollector collector, AcceptDocs acceptDocs) throws IOException { getDelegate().searchNearestVectors(field, target, collector, acceptDocs); } @Override - public void searchNearestVectors(String field, byte[] target, KnnCollector collector, Bits acceptDocs) throws IOException { + public void searchNearestVectors(String field, byte[] target, KnnCollector collector, AcceptDocs acceptDocs) throws IOException { getDelegate().searchNearestVectors(field, target, collector, acceptDocs); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/DocumentLeafReader.java b/server/src/main/java/org/elasticsearch/index/mapper/DocumentLeafReader.java index d37f6c51d288d..5a540785bd9fc 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/DocumentLeafReader.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/DocumentLeafReader.java @@ -34,6 +34,7 @@ import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.util.Bits; @@ -210,7 +211,7 @@ public FloatVectorValues getFloatVectorValues(String field) throws IOException { } @Override - public void searchNearestVectors(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) { + public void searchNearestVectors(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) { throw new UnsupportedOperationException(); } @@ -255,7 +256,7 @@ public ByteVectorValues getByteVectorValues(String field) { } @Override - public void searchNearestVectors(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) { + public void searchNearestVectors(String field, byte[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) { throw new UnsupportedOperationException(); } diff --git a/server/src/main/java/org/elasticsearch/index/shard/DenseVectorStats.java b/server/src/main/java/org/elasticsearch/index/shard/DenseVectorStats.java index 6e74647899080..e60f325e2f5b1 100644 --- a/server/src/main/java/org/elasticsearch/index/shard/DenseVectorStats.java +++ b/server/src/main/java/org/elasticsearch/index/shard/DenseVectorStats.java @@ -9,11 +9,11 @@ package org.elasticsearch.index.shard; +import org.apache.lucene.codecs.KnnVectorsReader; import org.elasticsearch.TransportVersions; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import org.elasticsearch.xcontent.ToXContentFragment; import org.elasticsearch.xcontent.XContentBuilder; @@ -92,7 +92,7 @@ public void add(DenseVectorStats other) { } else { this.offHeapStats = Stream.of(this.offHeapStats, other.offHeapStats) .flatMap(map -> map.entrySet().stream()) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, 
OffHeapByteSizeUtils::mergeOffHeapByteSizeMaps)); + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, KnnVectorsReader::mergeOffHeapByteSizeMaps)); } } } diff --git a/server/src/main/java/org/elasticsearch/index/store/FsDirectoryFactory.java b/server/src/main/java/org/elasticsearch/index/store/FsDirectoryFactory.java index 3159fe94e289c..680fcdab9d14a 100644 --- a/server/src/main/java/org/elasticsearch/index/store/FsDirectoryFactory.java +++ b/server/src/main/java/org/elasticsearch/index/store/FsDirectoryFactory.java @@ -22,24 +22,29 @@ import org.apache.lucene.store.NativeFSLockFactory; import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.store.SimpleFSLockFactory; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Setting.Property; import org.elasticsearch.common.util.FeatureFlag; import org.elasticsearch.core.IOUtils; import org.elasticsearch.index.IndexModule; import org.elasticsearch.index.IndexSettings; -import org.elasticsearch.index.codec.vectors.es818.DirectIOIndexInputSupplier; +import org.elasticsearch.index.StandardIOBehaviorHint; +import org.elasticsearch.index.codec.vectors.es818.DirectIOHint; import org.elasticsearch.index.shard.ShardPath; import org.elasticsearch.logging.LogManager; import org.elasticsearch.logging.Logger; import org.elasticsearch.plugins.IndexStorePlugin; import java.io.IOException; +import java.nio.file.FileSystemException; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashSet; +import java.util.Optional; import java.util.OptionalLong; import java.util.Set; +import java.util.function.BiFunction; import java.util.function.BiPredicate; import static org.apache.lucene.store.MMapDirectory.SHARED_ARENA_MAX_PERMITS_SYSPROP; @@ -95,21 +100,13 @@ protected Directory newFSDirectory(Path location, LockFactory lockFactory, Index final FSDirectory primaryDirectory = FSDirectory.open(location, lockFactory); if (primaryDirectory instanceof MMapDirectory mMapDirectory) { mMapDirectory = adjustSharedArenaGrouping(mMapDirectory); - Directory dir = new HybridDirectory(lockFactory, setPreload(mMapDirectory, preLoadExtensions)); - if (MADV_RANDOM_FEATURE_FLAG.isEnabled() == false) { - dir = disableRandomAdvice(dir); - } - return dir; + return new HybridDirectory(lockFactory, setMMapFunctions(mMapDirectory, preLoadExtensions)); } else { return primaryDirectory; } case MMAPFS: MMapDirectory mMapDirectory = adjustSharedArenaGrouping(new MMapDirectory(location, lockFactory)); - Directory dir = setPreload(mMapDirectory, preLoadExtensions); - if (MADV_RANDOM_FEATURE_FLAG.isEnabled() == false) { - dir = disableRandomAdvice(dir); - } - return dir; + return setMMapFunctions(mMapDirectory, preLoadExtensions); case SIMPLEFS: case NIOFS: return new NIOFSDirectory(location, lockFactory); @@ -120,8 +117,9 @@ protected Directory newFSDirectory(Path location, LockFactory lockFactory, Index /** Sets the preload, if any, on the given directory based on the extensions. Returns the same directory instance. 
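[Editor's note: a self-contained sketch of how the two MMapDirectory callbacks wired up in setMMapFunctions below compose under Lucene 10.3; the path and preload extensions are hypothetical, and the real factory derives them from index settings:]

import org.apache.lucene.store.MMapDirectory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Set;

class MMapSetupSketch {
    static MMapDirectory open(Path path) throws IOException {
        MMapDirectory dir = new MMapDirectory(path);
        Set<String> preloadExtensions = Set.of("vec", "veq"); // hypothetical
        // Preload is decided per file name and open context, as in getPreloadFunc below.
        dir.setPreload((name, context) -> {
            int dot = name.lastIndexOf('.');
            return dot > 0 && preloadExtensions.contains(name.substring(dot + 1));
        });
        // Keep Lucene's default mapping from IOContext hints to read advice.
        dir.setReadAdvice(MMapDirectory.ADVISE_BY_CONTEXT);
        return dir;
    }
}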
*/ // visibility and extensibility for testing - public MMapDirectory setPreload(MMapDirectory mMapDirectory, Set<String> preLoadExtensions) { + public MMapDirectory setMMapFunctions(MMapDirectory mMapDirectory, Set<String> preLoadExtensions) { mMapDirectory.setPreload(getPreloadFunc(preLoadExtensions)); + mMapDirectory.setReadAdvice(getReadAdviceFunc()); return mMapDirectory; } @@ -144,20 +142,12 @@ static BiPredicate<String, IOContext> getPreloadFunc(Set<String> preLoadExtensio return MMapDirectory.NO_FILES; } - /** - * Return a {@link FilterDirectory} around the provided {@link Directory} that forcefully disables {@link IOContext#readAdvice random - * access}. - */ - static Directory disableRandomAdvice(Directory dir) { - return new FilterDirectory(dir) { - @Override - public IndexInput openInput(String name, IOContext context) throws IOException { - if (context.readAdvice() == ReadAdvice.RANDOM) { - context = context.withReadAdvice(ReadAdvice.NORMAL); - } - assert context.readAdvice() != ReadAdvice.RANDOM; - return super.openInput(name, context); + private static BiFunction<String, IOContext, Optional<ReadAdvice>> getReadAdviceFunc() { + return (name, context) -> { + if (context.hints().contains(StandardIOBehaviorHint.INSTANCE)) { + return Optional.of(ReadAdvice.NORMAL); } + return MMapDirectory.ADVISE_BY_CONTEXT.apply(name, context); }; } @@ -169,7 +159,7 @@ public static boolean isHybridFs(Directory directory) { return unwrap instanceof HybridDirectory; } - static final class HybridDirectory extends NIOFSDirectory implements DirectIOIndexInputSupplier { + static final class HybridDirectory extends NIOFSDirectory { private final MMapDirectory delegate; private final DirectIODirectory directIODelegate; @@ -196,32 +186,39 @@ protected boolean useDirectIO(String name, IOContext context, OptionalLong fileL @Override public IndexInput openInput(String name, IOContext context) throws IOException { - if (useDelegate(name, context)) { - // we need to do these checks on the outer directory since the inner doesn't know about pending deletes + Throwable directIOException = null; + if (directIODelegate != null && context.hints().contains(DirectIOHint.INSTANCE)) { ensureOpen(); ensureCanRead(name); - // we switch the context here since mmap checks for the READONCE context by identity - context = context == Store.READONCE_CHECKSUM ? IOContext.READONCE : context; - // we only use the mmap to open inputs. Everything else is managed by the NIOFSDirectory otherwise - // we might run into trouble with files that are pendingDelete in one directory but still - // listed in listAll() from the other. We on the other hand don't want to list files from both dirs - // and intersect for perf reasons. - return delegate.openInput(name, context); - } else { - return super.openInput(name, context); + try { + Log.debug("Opening {} with direct IO", name); + return directIODelegate.openInput(name, context); + } catch (FileSystemException e) { + Log.debug(() -> Strings.format("Could not open %s with direct IO", name), e); + directIOException = e; + // and fallthrough to normal opening below + } } - } - @Override - public IndexInput openInputDirect(String name, IOContext context) throws IOException { - if (directIODelegate == null) { - return openInput(name, context); + try { + if (useDelegate(name, context)) { + // we need to do these checks on the outer directory since the inner doesn't know about pending deletes + ensureOpen(); + ensureCanRead(name); + // we only use the mmap to open inputs.
Everything else is managed by the NIOFSDirectory otherwise + // we might run into trouble with files that are pendingDelete in one directory but still + // listed in listAll() from the other. We on the other hand don't want to list files from both dirs + // and intersect for perf reasons. + return delegate.openInput(name, context); + } else { + return super.openInput(name, context); + } + } catch (Throwable t) { + if (directIOException != null) { + t.addSuppressed(directIOException); + } + throw t; } - // we need to do these checks on the outer directory since the inner doesn't know about pending deletes - ensureOpen(); - ensureCanRead(name); - Log.debug("Opening {} with direct IO", name); - return directIODelegate.openInput(name, context); } @Override @@ -240,7 +237,7 @@ private static String getExtension(String name) { } static boolean useDelegate(String name, IOContext ioContext) { - if (ioContext == Store.READONCE_CHECKSUM) { + if (ioContext.hints().contains(Store.FileFooterOnly.INSTANCE)) { // If we're just reading the footer for the checksum then mmap() isn't really necessary, and it's desperately inefficient // if pre-loading is enabled on this file. return false; diff --git a/server/src/main/java/org/elasticsearch/index/store/Store.java b/server/src/main/java/org/elasticsearch/index/store/Store.java index af28bc3bb32d3..27d186137958b 100644 --- a/server/src/main/java/org/elasticsearch/index/store/Store.java +++ b/server/src/main/java/org/elasticsearch/index/store/Store.java @@ -27,13 +27,14 @@ import org.apache.lucene.store.BufferedChecksum; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataAccessHint; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.Lock; import org.apache.lucene.store.NIOFSDirectory; -import org.apache.lucene.store.ReadAdvice; +import org.apache.lucene.store.ReadOnceHint; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; @@ -145,19 +146,22 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref Property.IndexScope ); + /** + * A {@link org.apache.lucene.store.IOContext.FileOpenHint} that we will only read the Lucene file footer + */ + public enum FileFooterOnly implements IOContext.FileOpenHint { + INSTANCE + } + /** * Specific {@link IOContext} indicating that we will read only the Lucene file footer (containing the file checksum) * See {@link MetadataSnapshot#checksumFromLuceneFile}. 
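[Editor's note: the Store.java hunks here illustrate the broader migration in this patch from identity-checked IOContext constants to value-checked FileOpenHints. A sketch of the pattern under Lucene 10.3, with ChecksumOnlyHint as a hypothetical stand-in for markers such as Store.FileFooterOnly or DirectIOHint:]

import org.apache.lucene.store.DataAccessHint;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.ReadOnceHint;

class HintSketch {
    // Hypothetical marker hint; an enum singleton keeps comparisons cheap.
    enum ChecksumOnlyHint implements IOContext.FileOpenHint {
        INSTANCE
    }

    static final IOContext CHECKSUM_CONTEXT = IOContext.READONCE.withHints(
        DataAccessHint.SEQUENTIAL,
        ReadOnceHint.INSTANCE,
        ChecksumOnlyHint.INSTANCE
    );

    static boolean checksumOnly(IOContext context) {
        // Membership replaces the old "checked by identity" comparison of contexts.
        return context.hints().contains(ChecksumOnlyHint.INSTANCE);
    }
}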
*/ - public static final IOContext READONCE_CHECKSUM = createReadOnceContext(); - - // while equivalent, these different read once contexts are checked by identity in directory implementations - private static IOContext createReadOnceContext() { - var context = IOContext.READONCE.withReadAdvice(ReadAdvice.SEQUENTIAL); - assert context != IOContext.READONCE; - assert context.equals(IOContext.READONCE); - return context; - } + public static final IOContext READONCE_CHECKSUM = IOContext.READONCE.withHints( + DataAccessHint.SEQUENTIAL, + ReadOnceHint.INSTANCE, + FileFooterOnly.INSTANCE + ); private final AtomicBoolean isClosed = new AtomicBoolean(false); private final StoreDirectory directory; @@ -934,8 +938,6 @@ private static void checksumFromLuceneFile( boolean readFileAsHash, BytesRef writerUuid ) throws IOException { - // We select the read once context carefully here since these constants, while equivalent are - // checked by identity in the different directory implementations. var context = file.startsWith(IndexFileNames.SEGMENTS) ? IOContext.READONCE : READONCE_CHECKSUM; try (IndexInput in = directory.openInput(file, context)) { final long length = in.length(); diff --git a/server/src/main/java/org/elasticsearch/search/internal/ExitableDirectoryReader.java b/server/src/main/java/org/elasticsearch/search/internal/ExitableDirectoryReader.java index 22c93032aecbe..7ce82290e206d 100644 --- a/server/src/main/java/org/elasticsearch/search/internal/ExitableDirectoryReader.java +++ b/server/src/main/java/org/elasticsearch/search/internal/ExitableDirectoryReader.java @@ -10,6 +10,7 @@ package org.elasticsearch.search.internal; import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.codecs.lucene90.IndexedDISI; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FilterDirectoryReader; @@ -22,11 +23,11 @@ import org.apache.lucene.index.QueryTimeout; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.VectorScorer; import org.apache.lucene.search.suggest.document.CompletionTerms; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.elasticsearch.common.lucene.index.SequentialStoredFieldsLeafReader; @@ -140,7 +141,7 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { } @Override - public void searchNearestVectors(String field, byte[] target, KnnCollector collector, Bits acceptDocs) throws IOException { + public void searchNearestVectors(String field, byte[] target, KnnCollector collector, AcceptDocs acceptDocs) throws IOException { if (queryCancellation.isEnabled() == false) { in.searchNearestVectors(field, target, collector, acceptDocs); return; @@ -158,7 +159,7 @@ public FloatVectorValues getFloatVectorValues(String field) throws IOException { } @Override - public void searchNearestVectors(String field, float[] target, KnnCollector collector, Bits acceptDocs) throws IOException { + public void searchNearestVectors(String field, float[] target, KnnCollector collector, AcceptDocs acceptDocs) throws IOException { if (queryCancellation.isEnabled() == false) { in.searchNearestVectors(field, target, collector, acceptDocs); return; @@ -488,8 +489,9 @@ public VectorScorer scorer(byte[] 
bytes) throws IOException { if (scorer == null) { return null; } + DocIdSetIterator scorerIterator = scorer.iterator(); return new VectorScorer() { - private final DocIdSetIterator iterator = new ExitableDocSetIterator(scorer.iterator(), queryCancellation); + private final DocIdSetIterator iterator = exitableIterator(scorerIterator, queryCancellation); @Override public float score() throws IOException { @@ -539,8 +541,9 @@ public VectorScorer scorer(float[] target) throws IOException { if (scorer == null) { return null; } + DocIdSetIterator scorerIterator = scorer.iterator(); return new VectorScorer() { - private final DocIdSetIterator iterator = new ExitableDocSetIterator(scorer.iterator(), queryCancellation); + private final DocIdSetIterator iterator = exitableIterator(scorerIterator, queryCancellation); @Override public float score() throws IOException { @@ -565,6 +568,17 @@ public FloatVectorValues copy() throws IOException { } } + /** Wraps the iterator in an exitable iterator, specializing for KnnVectorValues.DocIndexIterator. */ + static DocIdSetIterator exitableIterator(DocIdSetIterator iterator, QueryCancellation queryCancellation) { + if (iterator instanceof KnnVectorValues.DocIndexIterator docIndexIterator) { + return createExitableIterator(docIndexIterator, queryCancellation); + } else if (iterator instanceof IndexedDISI indexedDISI) { + return createExitableIterator(IndexedDISI.asDocIndexIterator(indexedDISI), queryCancellation); + } else { + return new ExitableDocSetIterator(iterator, queryCancellation); + } + } + private static KnnVectorValues.DocIndexIterator createExitableIterator( KnnVectorValues.DocIndexIterator delegate, QueryCancellation queryCancellation diff --git a/server/src/main/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReader.java b/server/src/main/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReader.java index f03be3f09b7d2..8143e6ef28053 100644 --- a/server/src/main/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReader.java +++ b/server/src/main/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReader.java @@ -30,9 +30,9 @@ import org.apache.lucene.index.TermVectors; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.suggest.document.CompletionTerms; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.elasticsearch.common.lucene.index.SequentialStoredFieldsLeafReader; @@ -221,7 +221,7 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { } @Override - public void searchNearestVectors(String field, byte[] target, KnnCollector collector, Bits acceptDocs) throws IOException { + public void searchNearestVectors(String field, byte[] target, KnnCollector collector, AcceptDocs acceptDocs) throws IOException { super.searchNearestVectors(field, target, collector, acceptDocs); if (collector.visitedCount() > 0) { notifier.onKnnVectorsUsed(field); @@ -229,7 +229,7 @@ public void searchNearestVectors(String field, byte[] target, KnnCollector colle } @Override - public void searchNearestVectors(String field, float[] target, KnnCollector collector, Bits acceptDocs) throws IOException { + public void searchNearestVectors(String field, float[] target, KnnCollector collector, AcceptDocs acceptDocs) throws IOException { 
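// Editor's note (not part of this change): Lucene 10.3 passes AcceptDocs here instead of a raw
// Bits. AcceptDocs bundles the live/filter docs with a cost estimate -- built via
// AcceptDocs.fromLiveDocs(liveDocs, maxDoc) or AcceptDocs.fromIteratorSupplier(...), as in the
// AbstractIVFKnnVectorQuery hunk below -- and still exposes a Bits view through bits().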
super.searchNearestVectors(field, target, collector, acceptDocs); if (collector.visitedCount() > 0) { notifier.onKnnVectorsUsed(field); diff --git a/server/src/main/java/org/elasticsearch/search/vectors/AbstractIVFKnnVectorQuery.java b/server/src/main/java/org/elasticsearch/search/vectors/AbstractIVFKnnVectorQuery.java index 00e083e0a6781..08825c55029fc 100644 --- a/server/src/main/java/org/elasticsearch/search/vectors/AbstractIVFKnnVectorQuery.java +++ b/server/src/main/java/org/elasticsearch/search/vectors/AbstractIVFKnnVectorQuery.java @@ -15,11 +15,10 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldExistsQuery; -import org.apache.lucene.search.FilteredDocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; @@ -33,9 +32,6 @@ import org.apache.lucene.search.Weight; import org.apache.lucene.search.knn.KnnCollectorManager; import org.apache.lucene.search.knn.KnnSearchStrategy; -import org.apache.lucene.util.BitSet; -import org.apache.lucene.util.BitSetIterator; -import org.apache.lucene.util.Bits; import org.elasticsearch.search.profile.query.QueryProfiler; import java.io.IOException; @@ -186,10 +182,10 @@ private TopDocs searchLeaf(LeafReaderContext ctx, Weight filterWeight, IVFCollec TopDocs getLeafResults(LeafReaderContext ctx, Weight filterWeight, IVFCollectorManager knnCollectorManager, float visitRatio) throws IOException { final LeafReader reader = ctx.reader(); - final Bits liveDocs = reader.getLiveDocs(); if (filterWeight == null) { - return approximateSearch(ctx, liveDocs, Integer.MAX_VALUE, knnCollectorManager, visitRatio); + AcceptDocs acceptDocs = AcceptDocs.fromLiveDocs(reader.getLiveDocs(), reader.maxDoc()); + return approximateSearch(ctx, acceptDocs, Integer.MAX_VALUE, knnCollectorManager, visitRatio); } Scorer scorer = filterWeight.scorer(ctx); @@ -197,14 +193,14 @@ TopDocs getLeafResults(LeafReaderContext ctx, Weight filterWeight, IVFCollectorM return TopDocsCollector.EMPTY_TOPDOCS; } - BitSet acceptDocs = createBitSet(scorer.iterator(), liveDocs, reader.maxDoc()); - final int cost = acceptDocs.cardinality(); + AcceptDocs acceptDocs = AcceptDocs.fromIteratorSupplier(scorer::iterator, reader.getLiveDocs(), reader.maxDoc()); + final int cost = acceptDocs.cost(); return approximateSearch(ctx, acceptDocs, cost + 1, knnCollectorManager, visitRatio); } abstract TopDocs approximateSearch( LeafReaderContext context, - Bits acceptDocs, + AcceptDocs acceptDocs, int visitedLimit, IVFCollectorManager knnCollectorManager, float visitRatio @@ -219,22 +215,6 @@ public final void profile(QueryProfiler queryProfiler) { queryProfiler.addVectorOpsCount(vectorOpsCount); } - BitSet createBitSet(DocIdSetIterator iterator, Bits liveDocs, int maxDoc) throws IOException { - if (liveDocs == null && iterator instanceof BitSetIterator bitSetIterator) { - // If we already have a BitSet and no deletions, reuse the BitSet - return bitSetIterator.getBitSet(); - } else { - // Create a new BitSet from matching and live docs - FilteredDocIdSetIterator filterIterator = new FilteredDocIdSetIterator(iterator) { - @Override - protected boolean match(int doc) { - return liveDocs == null || 
liveDocs.get(doc); - } - }; - return BitSet.of(filterIterator, maxDoc); - } - } - static class IVFCollectorManager implements KnnCollectorManager { private final int k; final LongAccumulator longAccumulator; diff --git a/server/src/main/java/org/elasticsearch/search/vectors/IVFKnnFloatVectorQuery.java b/server/src/main/java/org/elasticsearch/search/vectors/IVFKnnFloatVectorQuery.java index da452ecc992db..e31cf6cade3f2 100644 --- a/server/src/main/java/org/elasticsearch/search/vectors/IVFKnnFloatVectorQuery.java +++ b/server/src/main/java/org/elasticsearch/search/vectors/IVFKnnFloatVectorQuery.java @@ -11,9 +11,9 @@ import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.util.Bits; import java.io.IOException; import java.util.Arrays; @@ -73,7 +73,7 @@ public int hashCode() { @Override protected TopDocs approximateSearch( LeafReaderContext context, - Bits acceptDocs, + AcceptDocs acceptDocs, int visitedLimit, IVFCollectorManager knnCollectorManager, float visitRatio diff --git a/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec b/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec index 1fbdaea9c772a..971db6dcc032c 100644 --- a/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -2,3 +2,4 @@ org.elasticsearch.index.codec.Elasticsearch814Codec org.elasticsearch.index.codec.Elasticsearch816Codec org.elasticsearch.index.codec.Elasticsearch900Codec org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec +org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec diff --git a/server/src/main/resources/org/elasticsearch/bootstrap/security.policy b/server/src/main/resources/org/elasticsearch/bootstrap/security.policy index 4f3bc1f92060b..55abdc84fc8fb 100644 --- a/server/src/main/resources/org/elasticsearch/bootstrap/security.policy +++ b/server/src/main/resources/org/elasticsearch/bootstrap/security.policy @@ -32,10 +32,6 @@ grant codeBase "${codebase.elasticsearch}" { // for plugin api dynamic settings instances permission java.lang.RuntimePermission "accessClassInPackage.jdk.internal.reflect"; - - // For vector off-heap statistics, remove in Lucene 10.3 - permission java.lang.RuntimePermission "accessDeclaredMembers"; - permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; }; //// Very special jar permissions: diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java index fdcfe5e3720f8..57de001ef90e5 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java @@ -12,7 +12,7 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import 
org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; @@ -24,6 +24,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.KnnByteVectorField; import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.document.LatLonShape; import org.apache.lucene.document.LongPoint; @@ -67,6 +68,7 @@ import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.core.IOUtils; import org.elasticsearch.index.codec.postings.ES812PostingsFormat; +import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper; import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.index.store.LuceneFilesExtensions; import org.elasticsearch.test.ESTestCase; @@ -254,15 +256,27 @@ public void testKnnVectors() throws Exception { VectorSimilarityFunction similarity = randomFrom(VectorSimilarityFunction.values()); int numDocs = between(1000, 5000); int dimension = between(10, 200); + DenseVectorFieldMapper.ElementType elementType = randomFrom(DenseVectorFieldMapper.ElementType.values()); - indexRandomly(dir, codec, numDocs, doc -> { - float[] vector = randomVector(dimension); - doc.add(new KnnFloatVectorField("vector", vector, similarity)); - }); + if (elementType == DenseVectorFieldMapper.ElementType.FLOAT) { + indexRandomly(dir, codec, numDocs, doc -> { + float[] vector = randomVector(dimension); + doc.add(new KnnFloatVectorField("vector", vector, similarity)); + }); + } else { + indexRandomly(dir, codec, numDocs, doc -> { + byte[] vector = new byte[dimension]; + random().nextBytes(vector); + doc.add(new KnnByteVectorField("vector", vector, similarity)); + }); + } final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(dir), () -> {}); logger.info("--> stats {}", stats); - long dataBytes = (long) numDocs * dimension * Float.BYTES; // size of flat vector data + // expected size of flat vector data + long dataBytes = elementType == DenseVectorFieldMapper.ElementType.FLOAT + ? 
((long) numDocs * dimension * Float.BYTES) + : ((long) numDocs * dimension); long indexBytesEstimate = (long) numDocs * (Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN / 4); // rough size of HNSW graph assertThat("numDocs=" + numDocs + ";dimension=" + dimension, stats.total().getKnnVectorsBytes(), greaterThan(dataBytes)); long connectionOverhead = stats.total().getKnnVectorsBytes() - dataBytes; @@ -326,7 +340,7 @@ public void testTriangle() throws Exception { public void testCompletionField() throws Exception { IndexWriterConfig config = new IndexWriterConfig().setCommitOnClose(true) .setUseCompoundFile(false) - .setCodec(new Lucene101Codec(Lucene101Codec.Mode.BEST_SPEED) { + .setCodec(new Lucene103Codec(Lucene103Codec.Mode.BEST_SPEED) { @Override public PostingsFormat getPostingsFormatForField(String field) { if (field.startsWith("suggest_")) { @@ -432,25 +446,25 @@ private static void addFieldsToDoc(Document doc, IndexableField[] fields) { enum CodecMode { BEST_SPEED { @Override - Lucene101Codec.Mode mode() { - return Lucene101Codec.Mode.BEST_SPEED; + Lucene103Codec.Mode mode() { + return Lucene103Codec.Mode.BEST_SPEED; } }, BEST_COMPRESSION { @Override - Lucene101Codec.Mode mode() { - return Lucene101Codec.Mode.BEST_COMPRESSION; + Lucene103Codec.Mode mode() { + return Lucene103Codec.Mode.BEST_COMPRESSION; } }; - abstract Lucene101Codec.Mode mode(); + abstract Lucene103Codec.Mode mode(); } static void indexRandomly(Directory directory, CodecMode codecMode, int numDocs, Consumer addFields) throws IOException { IndexWriterConfig config = new IndexWriterConfig().setCommitOnClose(true) .setUseCompoundFile(randomBoolean()) - .setCodec(new Lucene101Codec(codecMode.mode())); + .setCodec(new Lucene103Codec(codecMode.mode())); try (IndexWriter writer = new IndexWriter(directory, config)) { for (int i = 0; i < numDocs; i++) { final Document doc = new Document(); @@ -662,7 +676,7 @@ static void rewriteIndexWithPerFieldCodec(Directory source, CodecMode mode, Dire try (DirectoryReader reader = DirectoryReader.open(source)) { IndexWriterConfig config = new IndexWriterConfig().setSoftDeletesField(Lucene.SOFT_DELETES_FIELD) .setUseCompoundFile(randomBoolean()) - .setCodec(new Lucene101Codec(mode.mode()) { + .setCodec(new Lucene103Codec(mode.mode()) { @Override public PostingsFormat getPostingsFormatForField(String field) { return new ES812PostingsFormat(); @@ -762,8 +776,9 @@ private static void assertStats(IndexDiskUsageStats actualStats, IndexDiskUsageS 0.01, 2048 ); - - assertFieldStats(field, "knn vectors", actualField.getKnnVectorsBytes(), expectedField.getKnnVectorsBytes(), 0.01, 1024); + // Allow difference of a file block size for knn vectors + // we get knn data usage from getOffHeapByteSize but when written on disk it can be rounded to the next block size + assertFieldStats(field, "knn vectors", actualField.getKnnVectorsBytes(), expectedField.getKnnVectorsBytes(), 0.01, 4096); } // We are not able to collect per field stats for stored, vector, points, and norms IndexDiskUsageStats.PerFieldDiskUsage actualTotal = actualStats.total(); diff --git a/server/src/test/java/org/elasticsearch/common/lucene/search/function/MinScoreScorerTests.java b/server/src/test/java/org/elasticsearch/common/lucene/search/function/MinScoreScorerTests.java index 55ca666d8588b..b44c9e06bc059 100644 --- a/server/src/test/java/org/elasticsearch/common/lucene/search/function/MinScoreScorerTests.java +++ b/server/src/test/java/org/elasticsearch/common/lucene/search/function/MinScoreScorerTests.java @@ -142,7 +142,7 @@ public 
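The two-branch `dataBytes` estimate above is plain storage arithmetic: float vectors cost `Float.BYTES` per dimension, byte vectors one byte. A worked example with hypothetical values inside the test's random ranges:

```java
final class FlatVectorSizeSketch {
    public static void main(String[] args) {
        long numDocs = 2_000; // hypothetical, within between(1000, 5000)
        int dimension = 100;  // hypothetical, within between(10, 200)
        long floatDataBytes = numDocs * dimension * Float.BYTES; // 800,000 bytes of raw float32 data
        long byteDataBytes = numDocs * dimension;                // 200,000 bytes of raw int8 data
        System.out.println(floatDataBytes + " vs " + byteDataBytes);
    }
}
```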
float getMaxScore(int upTo) throws IOException { random(), new ScoreMode[] { ScoreMode.COMPLETE, ScoreMode.TOP_SCORES, ScoreMode.TOP_DOCS_WITH_SCORES } ); - final Scorer assertingScorer = AssertingScorer.wrap(random(), scorer, scoreMode, true); + final Scorer assertingScorer = AssertingScorer.wrap(scorer, scoreMode.needsScores(), true); if (twoPhase && randomBoolean()) { return hideTwoPhaseIterator(assertingScorer); } else { diff --git a/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java b/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java index 23ee616d54231..a2ff440facaf0 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java @@ -50,7 +50,7 @@ public void testResolveDefaultCodecs() throws Exception { assumeTrue("Only when zstd_stored_fields feature flag is enabled", CodecService.ZSTD_STORED_FIELDS_FEATURE_FLAG); CodecService codecService = createCodecService(); assertThat(codecService.codec("default"), instanceOf(PerFieldMapperCodec.class)); - assertThat(codecService.codec("default"), instanceOf(Elasticsearch900Lucene101Codec.class)); + assertThat(codecService.codec("default"), instanceOf(Elasticsearch92Lucene103Codec.class)); } public void testDefault() throws Exception { diff --git a/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java b/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java index e281f5b09d520..f291d85e0bcc6 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java @@ -10,7 +10,7 @@ package org.elasticsearch.index.codec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.common.settings.Settings; @@ -98,7 +98,7 @@ public void testUseBloomFilter() throws IOException { assertThat(perFieldMapperCodec.getPostingsFormatForField("_id"), instanceOf(ES87BloomFilterPostingsFormat.class)); assertThat(perFieldMapperCodec.useBloomFilter("another_field"), is(false)); - Class expectedPostingsFormat = timeSeries ? ES812PostingsFormat.class : Lucene101PostingsFormat.class; + Class expectedPostingsFormat = timeSeries ? 
ES812PostingsFormat.class : Lucene103PostingsFormat.class; assertThat(perFieldMapperCodec.getPostingsFormatForField("another_field"), instanceOf(expectedPostingsFormat)); } @@ -113,7 +113,7 @@ public void testUseBloomFilterWithTimestampFieldEnabled() throws IOException { public void testUseBloomFilterWithTimestampFieldEnabled_noTimeSeriesMode() throws IOException { PerFieldFormatSupplier perFieldMapperCodec = createFormatSupplier(true, false, false); assertThat(perFieldMapperCodec.useBloomFilter("_id"), is(false)); - assertThat(perFieldMapperCodec.getPostingsFormatForField("_id"), instanceOf(Lucene101PostingsFormat.class)); + assertThat(perFieldMapperCodec.getPostingsFormatForField("_id"), instanceOf(Lucene103PostingsFormat.class)); } public void testUseBloomFilterWithTimestampFieldEnabled_disableBloomFilter() throws IOException { diff --git a/server/src/test/java/org/elasticsearch/index/codec/postings/ES812PostingsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/postings/ES812PostingsFormatTests.java index b11ab47102288..f59e075d6ec5a 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/postings/ES812PostingsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/postings/ES812PostingsFormatTests.java @@ -19,10 +19,10 @@ */ package org.elasticsearch.index.codec.postings; +import org.apache.lucene.backward_codecs.lucene90.blocktree.FieldReader; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; -import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; -import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java index 1036d822c0a21..ee9351ed51b97 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java @@ -26,7 +26,7 @@ import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormatTests.TestES87TSDBDocValuesFormat; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import org.elasticsearch.test.ESTestCase; @@ -55,7 +55,7 @@ public void testDuel() throws IOException { baselineConfig.setCodec(TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat())); var contenderConf = newIndexWriterConfig(); contenderConf.setMergePolicy(mergePolicy); - Codec codec = new Elasticsearch900Lucene101Codec() { + Codec codec = new Elasticsearch92Lucene103Codec() { final DocValuesFormat docValuesFormat = randomBoolean() ? 
new ES819TSDBDocValuesFormat( diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index daedfce7b71e1..9556c949f0fd9 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -12,7 +12,6 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesConsumer; -import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.SortedDocValuesField; @@ -34,9 +33,9 @@ import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.BaseDocValuesFormatTestCase; import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.logging.LogConfigurator; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; import java.io.IOException; import java.util.ArrayList; @@ -53,7 +52,6 @@ public class ES87TSDBDocValuesFormatTests extends BaseDocValuesFormatTestCase { private static final int NUM_DOCS = 10; static { - // For Elasticsearch900Lucene101Codec: LogConfigurator.loadLog4jPlugins(); LogConfigurator.configureESLogging(); } @@ -74,13 +72,7 @@ public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOExcept } } - private final Codec codec = new Elasticsearch900Lucene101Codec() { - - @Override - public DocValuesFormat getDocValuesFormatForField(String field) { - return new TestES87TSDBDocValuesFormat(); - } - }; + private final Codec codec = TestUtil.alwaysDocValuesFormat(new TestES87TSDBDocValuesFormat()); @Override protected Codec getCodec() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java index 37a62b3605c2c..d2c8aae601977 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java @@ -38,7 +38,7 @@ import org.elasticsearch.cluster.metadata.DataStream; import org.elasticsearch.core.SuppressForbidden; import org.elasticsearch.index.codec.Elasticsearch816Codec; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; import org.elasticsearch.index.codec.perfield.XPerFieldDocValuesFormat; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormatTests.TestES87TSDBDocValuesFormat; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; @@ -72,7 +72,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) { return docValuesFormat; } }; - var newCodec = new Elasticsearch900Lucene101Codec() { + var newCodec = new Elasticsearch92Lucene103Codec() { final DocValuesFormat docValuesFormat = new ES819TSDBDocValuesFormat(); diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java index ea29b1cbf1356..003124ab4b6f4 100644 --- 
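`TestUtil.alwaysDocValuesFormat` is the test-framework shorthand for the anonymous per-field codec it replaces above: it wraps the default test codec so every field is written with the supplied `DocValuesFormat`. A sketch of the equivalence, assuming the Lucene test framework is on the classpath:

```java
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.tests.util.TestUtil;

final class AlwaysDocValuesFormatSketch {
    // One call instead of an anonymous codec subclass overriding
    // getDocValuesFormatForField for every field name.
    static Codec codecFor(DocValuesFormat format) {
        return TestUtil.alwaysDocValuesFormat(format);
    }
}
```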
a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java @@ -28,6 +28,7 @@ import org.apache.lucene.index.LogByteSizeMergePolicy; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; @@ -40,6 +41,7 @@ import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.util.CollectionUtils; import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormatTests; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesProducer.BaseDenseNumericValues; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesProducer.BaseSortedDocValues; @@ -60,12 +62,13 @@ import java.util.function.Supplier; import java.util.stream.IntStream; +import static org.elasticsearch.test.ESTestCase.randomFrom; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; public class ES819TSDBDocValuesFormatTests extends ES87TSDBDocValuesFormatTests { - private final Codec codec = new Elasticsearch900Lucene101Codec() { + private final Codec codec = new Elasticsearch92Lucene103Codec() { final ES819TSDBDocValuesFormat docValuesFormat = new ES819TSDBDocValuesFormat( ESTestCase.randomIntBetween(2, 4096), @@ -1388,6 +1391,134 @@ private static BaseSortedDocValues getBaseSortedDocValues(LeafReader leafReader, return (BaseSortedDocValues) sortedDocValues; } + public void testDocIDEndRun() throws IOException { + String timestampField = "@timestamp"; + String hostnameField = "host.name"; + long baseTimestamp = 1704067200000L; + + var config = getTimeSeriesIndexWriterConfig(hostnameField, timestampField); + try (var dir = newDirectory(); var iw = new IndexWriter(dir, config)) { + long counter1 = 0; + + long[] gauge2Values = new long[] { -2, -4, -6, -8, -10, -12, -14, -16 }; + String[] tags = new String[] { "tag_1", "tag_2", "tag_3", "tag_4", "tag_5", "tag_6", "tag_7", "tag_8" }; + + // IndexedDISI stores ids in blocks of 4096. 
To test sparse end runs, we want a mixture of + // dense and sparse blocks, so we need the gap frequency to be larger than + // this value, but smaller than two blocks, and to index at least three blocks + int gap_frequency = 4500 + random().nextInt(2048); + int numDocs = 10000 + random().nextInt(10000); + int numHosts = numDocs / 20; + + for (int i = 0; i < numDocs; i++) { + var d = new Document(); + + int batchIndex = i / numHosts; + String hostName = String.format(Locale.ROOT, "host-%03d", batchIndex); + long timestamp = baseTimestamp + (1000L * i); + + d.add(new SortedDocValuesField(hostnameField, new BytesRef(hostName))); + // Index sorting doesn't work with NumericDocValuesField: + d.add(new SortedNumericDocValuesField(timestampField, timestamp)); + d.add(new NumericDocValuesField("counter", counter1++)); + if (i % gap_frequency != 0) { + d.add(new NumericDocValuesField("sparse_counter", counter1)); + } + + int numGauge2 = 1 + random().nextInt(8); + for (int j = 0; j < numGauge2; j++) { + d.add(new SortedNumericDocValuesField("gauge", gauge2Values[(i + j) % gauge2Values.length])); + if (i % gap_frequency != 0) { + d.add(new SortedNumericDocValuesField("sparse_gauge", gauge2Values[(i + j) % gauge2Values.length])); + } + } + + d.add(new SortedDocValuesField("tag", new BytesRef(randomFrom(tags)))); + if (i % gap_frequency != 0) { + d.add(new SortedDocValuesField("sparse_tag", new BytesRef(randomFrom(tags)))); + } + + int numTags = 1 + random().nextInt(8); + for (int j = 0; j < numTags; j++) { + d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[(i + j) % tags.length]))); + if (i % gap_frequency != 0) { + d.add(new SortedSetDocValuesField("sparse_tags", new BytesRef(tags[(i + j) % tags.length]))); + } + } + + d.add(new BinaryDocValuesField("tags_as_bytes", new BytesRef(tags[i % tags.length]))); + if (i % gap_frequency != 0) { + d.add(new BinaryDocValuesField("sparse_tags_as_bytes", new BytesRef(tags[i % tags.length]))); + } + + iw.addDocument(d); + if (i % 100 == 0) { + iw.commit(); + } + } + iw.commit(); + + iw.forceMerge(1); + + try (var reader = DirectoryReader.open(iw)) { + assertEquals(1, reader.leaves().size()); + assertEquals(numDocs, reader.maxDoc()); + var leaf = reader.leaves().get(0).reader(); + var hostNameDV = leaf.getSortedDocValues(hostnameField); + assertNotNull(hostNameDV); + validateRunEnd(hostNameDV); + var timestampDV = DocValues.unwrapSingleton(leaf.getSortedNumericDocValues(timestampField)); + assertNotNull(timestampDV); + validateRunEnd(timestampDV); + var counterOneDV = leaf.getNumericDocValues("counter"); + assertNotNull(counterOneDV); + validateRunEnd(counterOneDV); + var sparseCounter = leaf.getNumericDocValues("sparse_counter"); + assertNotNull(sparseCounter); + validateRunEnd(sparseCounter); + var gaugeOneDV = leaf.getSortedNumericDocValues("gauge"); + assertNotNull(gaugeOneDV); + validateRunEnd(gaugeOneDV); + var sparseGaugeDV = leaf.getSortedNumericDocValues("sparse_gauge"); + assertNotNull(sparseGaugeDV); + validateRunEnd(sparseGaugeDV); + var tagDV = leaf.getSortedDocValues("tag"); + assertNotNull(tagDV); + validateRunEnd(tagDV); + var sparseTagDV = leaf.getSortedDocValues("sparse_tag"); + assertNotNull(sparseTagDV); + validateRunEnd(sparseTagDV); + var tagsDV = leaf.getSortedSetDocValues("tags"); + assertNotNull(tagsDV); + validateRunEnd(tagsDV); + var sparseTagsDV = leaf.getSortedSetDocValues("sparse_tags"); + assertNotNull(sparseTagsDV); + validateRunEnd(sparseTagsDV); + var tagBytesDV = leaf.getBinaryDocValues("tags_as_bytes"); + 
assertNotNull(tagBytesDV); + validateRunEnd(tagBytesDV); + var sparseTagBytesDV = leaf.getBinaryDocValues("sparse_tags_as_bytes"); + assertNotNull(sparseTagBytesDV); + validateRunEnd(sparseTagBytesDV); + } + } + } + + private void validateRunEnd(DocIdSetIterator iterator) throws IOException { + int runCount = 0; + while (iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + int runLength = iterator.docIDRunEnd() - iterator.docID() - 1; + if (runLength > 1) { + runCount++; + for (int i = 0; i < runLength; i++) { + int expected = iterator.docID() + 1; + assertEquals(expected, iterator.advance(expected)); + } + } + } + assertTrue("Expected docid runs of greater than 1", runCount > 0); + } + private IndexWriterConfig getTimeSeriesIndexWriterConfig(String hostnameField, String timestampField) { return getTimeSeriesIndexWriterConfig(hostnameField, false, timestampField); } diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormatTests.java index 62ee4750abb59..64eab1b544730 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormatTests.java @@ -23,7 +23,6 @@ import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.common.logging.LogConfigurator; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import java.io.IOException; @@ -62,7 +61,7 @@ public void testSimpleOffHeapSize() throws IOException { knnVectorsReader = fieldsReader.getFieldReader("f"); } var fieldInfo = r.getFieldInfos().fieldInfo("f"); - var offHeap = OffHeapByteSizeUtils.getOffHeapByteSize(knnVectorsReader, fieldInfo); + var offHeap = knnVectorsReader.getOffHeapByteSize(fieldInfo); assertEquals(vector.length * Float.BYTES, (long) offHeap.get("vec")); assertEquals(1, offHeap.size()); } diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormatTests.java index b4cf838d1f86c..a02e7428cbba0 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormatTests.java @@ -23,7 +23,6 @@ import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.common.logging.LogConfigurator; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import java.io.IOException; @@ -62,7 +61,7 @@ public void testSimpleOffHeapSize() throws IOException { knnVectorsReader = fieldsReader.getFieldReader("f"); } var fieldInfo = r.getFieldInfos().fieldInfo("f"); - var offHeap = OffHeapByteSizeUtils.getOffHeapByteSize(knnVectorsReader, fieldInfo); + var offHeap = knnVectorsReader.getOffHeapByteSize(fieldInfo); assertEquals(2, offHeap.size()); assertEquals(vector.length * Float.BYTES, (long) offHeap.get("vec")); assertTrue(offHeap.get("veq") > 0L); diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES814HnswScalarQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES814HnswScalarQuantizedVectorsFormatTests.java index 055bdea16f6a1..fdbf4679e6ab5 100644 --- 
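`validateRunEnd` above leans on the `DocIdSetIterator#docIDRunEnd()` contract: every doc in `[docID(), docIDRunEnd())` is guaranteed to match, which is what lets dense IndexedDISI blocks be consumed without a per-document `advance()`. A sketch of a consumer written against that contract:

```java
import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;

final class DocIdRunSketch {
    static long countMatches(DocIdSetIterator it) throws IOException {
        long matches = 0;
        int doc = it.nextDoc();
        while (doc != DocIdSetIterator.NO_MORE_DOCS) {
            int runEnd = it.docIDRunEnd(); // exclusive end of the current run of matching docs
            matches += runEnd - doc;       // count the whole run at once
            doc = it.advance(runEnd);      // jump straight past the run
        }
        return matches;
    }
}
```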
a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES814HnswScalarQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES814HnswScalarQuantizedVectorsFormatTests.java @@ -24,12 +24,12 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.StoredFields; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.common.logging.LogConfigurator; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import java.io.IOException; import java.nio.file.Path; @@ -171,7 +171,13 @@ private void testSingleVectorPerSegment(VectorSimilarityFunction sim) throws Exc LeafReader leafReader = getOnlyLeafReader(reader); StoredFields storedFields = reader.storedFields(); float[] queryVector = new float[] { 0.6f, 0.8f }; - var hits = leafReader.searchNearestVectors("field", queryVector, 3, null, 100); + var hits = leafReader.searchNearestVectors( + "field", + queryVector, + 3, + AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()), + 100 + ); assertEquals(hits.scoreDocs.length, 3); assertEquals("B", storedFields.document(hits.scoreDocs[0].doc).get("id")); assertEquals("A", storedFields.document(hits.scoreDocs[1].doc).get("id")); @@ -195,7 +201,7 @@ public void testSimpleOffHeapSize() throws IOException { knnVectorsReader = fieldsReader.getFieldReader("f"); } var fieldInfo = r.getFieldInfos().fieldInfo("f"); - var offHeap = OffHeapByteSizeUtils.getOffHeapByteSize(knnVectorsReader, fieldInfo); + var offHeap = knnVectorsReader.getOffHeapByteSize(fieldInfo); assertEquals(3, offHeap.size()); assertEquals(vector.length * Float.BYTES, (long) offHeap.get("vec")); assertEquals(1L, (long) offHeap.get("vex")); diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorFormatTests.java index 36c20f7a2b7e8..b3d33dcd3cd46 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorFormatTests.java @@ -22,7 +22,6 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.util.TestUtil; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import org.junit.Before; import java.io.IOException; @@ -56,7 +55,7 @@ public void testSimpleOffHeapSize() throws IOException { knnVectorsReader = fieldsReader.getFieldReader("f"); } var fieldInfo = r.getFieldInfos().fieldInfo("f"); - var offHeap = OffHeapByteSizeUtils.getOffHeapByteSize(knnVectorsReader, fieldInfo); + var offHeap = knnVectorsReader.getOffHeapByteSize(fieldInfo); assertEquals(1, offHeap.size()); assertTrue(offHeap.get("vec") > 0L); } diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815HnswBitVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815HnswBitVectorsFormatTests.java index 8d7d59ef83050..03de893958ab0 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815HnswBitVectorsFormatTests.java +++ 
b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815HnswBitVectorsFormatTests.java @@ -22,7 +22,6 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.util.TestUtil; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import org.junit.Before; import java.io.IOException; @@ -56,7 +55,7 @@ public void testSimpleOffHeapSize() throws IOException { knnVectorsReader = fieldsReader.getFieldReader("f"); } var fieldInfo = r.getFieldInfos().fieldInfo("f"); - var offHeap = OffHeapByteSizeUtils.getOffHeapByteSize(knnVectorsReader, fieldInfo); + var offHeap = knnVectorsReader.getOffHeapByteSize(fieldInfo); assertEquals(2, offHeap.size()); assertTrue(offHeap.get("vec") > 0L); assertEquals(1L, (long) offHeap.get("vex")); diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java index eaf6b66c286d1..0507b49968fcb 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/diskbbq/ES920DiskBBQVectorsFormatTests.java @@ -25,12 +25,12 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.common.logging.LogConfigurator; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import org.junit.Before; import java.io.IOException; @@ -43,6 +43,8 @@ import static org.elasticsearch.index.codec.vectors.diskbbq.ES920DiskBBQVectorsFormat.MAX_VECTORS_PER_CLUSTER; import static org.elasticsearch.index.codec.vectors.diskbbq.ES920DiskBBQVectorsFormat.MIN_CENTROIDS_PER_PARENT_CLUSTER; import static org.elasticsearch.index.codec.vectors.diskbbq.ES920DiskBBQVectorsFormat.MIN_VECTORS_PER_CLUSTER; +import static org.hamcrest.Matchers.anEmptyMap; +import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.oneOf; @@ -99,6 +101,25 @@ protected Codec getCodec() { return TestUtil.alwaysKnnVectorsFormat(format); } + @Override + protected void assertOffHeapByteSize(LeafReader r, String fieldName) throws IOException { + var fieldInfo = r.getFieldInfos().fieldInfo(fieldName); + + if (r instanceof CodecReader codecReader) { + KnnVectorsReader knnVectorsReader = codecReader.getVectorReader(); + if (knnVectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader fieldsReader) { + knnVectorsReader = fieldsReader.getFieldReader(fieldName); + } + var offHeap = knnVectorsReader.getOffHeapByteSize(fieldInfo); + long totalByteSize = offHeap.values().stream().mapToLong(Long::longValue).sum(); + // IVF doesn't report stats at the moment + assertThat(offHeap, anEmptyMap()); + assertThat(totalByteSize, equalTo(0L)); + } else { + throw new AssertionError("unexpected:" + r.getClass()); + } + } + @Override public void testAdvance() throws Exception { // TODO re-enable with hierarchical IVF, clustering as it is is flaky @@ -140,7 +161,7 @@ public void testSimpleOffHeapSize() throws IOException { knnVectorsReader = 
fieldsReader.getFieldReader("f"); } var fieldInfo = r.getFieldInfos().fieldInfo("f"); - var offHeap = OffHeapByteSizeUtils.getOffHeapByteSize(knnVectorsReader, fieldInfo); + var offHeap = knnVectorsReader.getOffHeapByteSize(fieldInfo); assertEquals(0, offHeap.size()); } } @@ -171,7 +192,13 @@ public void testFewVectorManyTimes() throws IOException { for (LeafReaderContext r : subReaders) { LeafReader leafReader = r.reader(); float[] vector = randomVector(dimensions); - TopDocs topDocs = leafReader.searchNearestVectors("f", vector, 10, leafReader.getLiveDocs(), Integer.MAX_VALUE); + TopDocs topDocs = leafReader.searchNearestVectors( + "f", + vector, + 10, + AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()), + Integer.MAX_VALUE + ); assertEquals(Math.min(leafReader.maxDoc(), 10), topDocs.scoreDocs.length); } @@ -199,7 +226,13 @@ public void testOneRepeatedVector() throws IOException { for (LeafReaderContext r : subReaders) { LeafReader leafReader = r.reader(); float[] vector = randomVector(dimensions); - TopDocs topDocs = leafReader.searchNearestVectors("f", vector, 10, leafReader.getLiveDocs(), Integer.MAX_VALUE); + TopDocs topDocs = leafReader.searchNearestVectors( + "f", + vector, + 10, + AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()), + Integer.MAX_VALUE + ); assertEquals(Math.min(leafReader.maxDoc(), 10), topDocs.scoreDocs.length); } @@ -230,7 +263,13 @@ public void testWithThreads() throws Exception { for (; totSearch < numSearches && failed.get() == false; totSearch++) { float[] vector = randomVector(dimensions); LeafReader leafReader = getOnlyLeafReader(reader); - leafReader.searchNearestVectors("f", vector, 10, leafReader.getLiveDocs(), Integer.MAX_VALUE); + leafReader.searchNearestVectors( + "f", + vector, + 10, + AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()), + Integer.MAX_VALUE + ); } assertTrue(totSearch > 0); } catch (Exception exc) { diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java index 75299064d77ad..0fd3eb4cfc3e5 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java @@ -55,7 +55,6 @@ import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.common.logging.LogConfigurator; import org.elasticsearch.index.codec.vectors.BQVectorUtils; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import java.io.IOException; import java.util.ArrayList; @@ -262,7 +261,7 @@ public void testSimpleOffHeapSize() throws IOException { knnVectorsReader = fieldsReader.getFieldReader("f"); } var fieldInfo = r.getFieldInfos().fieldInfo("f"); - var offHeap = OffHeapByteSizeUtils.getOffHeapByteSize(knnVectorsReader, fieldInfo); + var offHeap = knnVectorsReader.getOffHeapByteSize(fieldInfo); assertEquals(2, offHeap.size()); assertEquals(vector.length * Float.BYTES, (long) offHeap.get("vec")); assertTrue(offHeap.get("veb") > 0L); diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816HnswBinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816HnswBinaryQuantizedVectorsFormatTests.java index 5da9ba1080195..c10fa9428bc13 100644 --- 
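The off-heap assertions in these vector-format tests all converge on the same pattern: Lucene 10.3 exposes `KnnVectorsReader#getOffHeapByteSize(FieldInfo)` directly, which is why the reflection-based `OffHeapByteSizeUtils` (and the `suppressAccessChecks` grants removed from security.policy earlier in this diff) could go. A sketch of the lookup, including the per-field unwrapping the tests do:

```java
import java.io.IOException;
import java.util.Map;

import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReader;

final class OffHeapSizeSketch {
    static long totalOffHeapBytes(LeafReader leaf, String field) throws IOException {
        if (leaf instanceof CodecReader codecReader) {
            KnnVectorsReader reader = codecReader.getVectorReader();
            if (reader instanceof PerFieldKnnVectorsFormat.FieldsReader fieldsReader) {
                reader = fieldsReader.getFieldReader(field); // unwrap per-field dispatch
            }
            FieldInfo fieldInfo = leaf.getFieldInfos().fieldInfo(field);
            // Keys are index file extensions: "vec" (raw vectors), "vex" (HNSW graph),
            // "veb"/"veq" (binary/scalar quantized data), per the assertions above.
            Map<String, Long> offHeap = reader.getOffHeapByteSize(fieldInfo);
            return offHeap.values().stream().mapToLong(Long::longValue).sum();
        }
        throw new IllegalArgumentException("not a CodecReader: " + leaf.getClass());
    }
}
```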
a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816HnswBinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816HnswBinaryQuantizedVectorsFormatTests.java @@ -35,13 +35,13 @@ import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.SameThreadExecutorService; import org.elasticsearch.common.logging.LogConfigurator; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import java.io.IOException; import java.util.Arrays; @@ -100,7 +100,13 @@ public void testSingleVectorCase() throws Exception { while (docIndexIterator.nextDoc() != NO_MORE_DOCS) { assertArrayEquals(vector, vectorValues.vectorValue(docIndexIterator.index()), 0.00001f); } - TopDocs td = r.searchNearestVectors("f", randomVector(vector.length), 1, null, Integer.MAX_VALUE); + TopDocs td = r.searchNearestVectors( + "f", + randomVector(vector.length), + 1, + AcceptDocs.fromLiveDocs(r.getLiveDocs(), r.maxDoc()), + Integer.MAX_VALUE + ); assertEquals(1, td.totalHits.value()); assertTrue(td.scoreDocs[0].score >= 0); } @@ -144,7 +150,7 @@ public void testSimpleOffHeapSize() throws IOException { knnVectorsReader = fieldsReader.getFieldReader("f"); } var fieldInfo = r.getFieldInfos().fieldInfo("f"); - var offHeap = OffHeapByteSizeUtils.getOffHeapByteSize(knnVectorsReader, fieldInfo); + var offHeap = knnVectorsReader.getOffHeapByteSize(fieldInfo); assertEquals(3, offHeap.size()); assertEquals(vector.length * Float.BYTES, (long) offHeap.get("vec")); assertEquals(1L, (long) offHeap.get("vex")); diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java index 7bd152da6d6c3..540e9f1587cd1 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java @@ -25,7 +25,6 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsReader; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -38,7 +37,6 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.SegmentReader; import org.apache.lucene.index.SoftDeletesRetentionMergePolicy; import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorSimilarityFunction; @@ -68,9 +66,7 @@ import org.elasticsearch.index.IndexModule; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.codec.vectors.BQVectorUtils; -import org.elasticsearch.index.codec.vectors.MergeReaderWrapper; import org.elasticsearch.index.codec.vectors.OptimizedScalarQuantizer; -import 
org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.index.shard.ShardPath; import org.elasticsearch.index.store.FsDirectoryFactory; @@ -88,7 +84,6 @@ import static java.lang.String.format; import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; -import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.oneOf; @@ -318,7 +313,7 @@ public void testSimpleOffHeapSizeImpl(Directory dir, IndexWriterConfig config, b knnVectorsReader = fieldsReader.getFieldReader("f"); } var fieldInfo = r.getFieldInfos().fieldInfo("f"); - var offHeap = OffHeapByteSizeUtils.getOffHeapByteSize(knnVectorsReader, fieldInfo); + var offHeap = knnVectorsReader.getOffHeapByteSize(fieldInfo); assertEquals(expectVecOffHeap ? 2 : 1, offHeap.size()); assertTrue(offHeap.get("veb") > 0L); if (expectVecOffHeap) { @@ -329,43 +324,6 @@ public void testSimpleOffHeapSizeImpl(Directory dir, IndexWriterConfig config, b } } - public void testMergeInstance() throws IOException { - checkDirectIOSupported(); - float[] vector = randomVector(10); - VectorSimilarityFunction similarityFunction = randomSimilarity(); - KnnFloatVectorField knnField = new KnnFloatVectorField("field", vector, similarityFunction); - try (Directory dir = newFSDirectory()) { - try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setUseCompoundFile(false))) { - Document doc = new Document(); - knnField.setVectorValue(randomVector(10)); - doc.add(knnField); - w.addDocument(doc); - w.commit(); - - try (IndexReader reader = DirectoryReader.open(w)) { - SegmentReader r = (SegmentReader) getOnlyLeafReader(reader); - assertThat(unwrapRawVectorReader("field", r.getVectorReader()), instanceOf(DirectIOLucene99FlatVectorsReader.class)); - assertThat( - unwrapRawVectorReader("field", r.getVectorReader().getMergeInstance()), - instanceOf(Lucene99FlatVectorsReader.class) - ); - } - } - } - } - - private static KnnVectorsReader unwrapRawVectorReader(String fieldName, KnnVectorsReader knnReader) { - if (knnReader instanceof PerFieldKnnVectorsFormat.FieldsReader perField) { - return unwrapRawVectorReader(fieldName, perField.getFieldReader(fieldName)); - } else if (knnReader instanceof ES818BinaryQuantizedVectorsReader bbqReader) { - return unwrapRawVectorReader(fieldName, bbqReader.getRawVectorsReader()); - } else if (knnReader instanceof MergeReaderWrapper mergeReaderWrapper) { - return unwrapRawVectorReader(fieldName, mergeReaderWrapper.getMainReader()); - } else { - return knnReader; - } - } - static Directory newMMapDirectory() throws IOException { Directory dir = new MMapDirectory(createTempDir("ES818BinaryQuantizedVectorsFormatTests")); if (random().nextBoolean()) { diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818HnswBinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818HnswBinaryQuantizedVectorsFormatTests.java index e63ae74f625cf..7bc78186fa533 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818HnswBinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818HnswBinaryQuantizedVectorsFormatTests.java @@ -37,6 +37,7 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.VectorSimilarityFunction; import 
org.apache.lucene.misc.store.DirectIODirectory; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; @@ -52,7 +53,6 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.IndexModule; import org.elasticsearch.index.IndexSettings; -import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils; import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.index.shard.ShardPath; import org.elasticsearch.index.store.FsDirectoryFactory; @@ -126,7 +126,13 @@ public void testSingleVectorCase() throws Exception { VectorUtil.l2normalize(randomVector); } float trueScore = similarityFunction.compare(vector, randomVector); - TopDocs td = r.searchNearestVectors("f", randomVector, 1, null, Integer.MAX_VALUE); + TopDocs td = r.searchNearestVectors( + "f", + randomVector, + 1, + AcceptDocs.fromLiveDocs(r.getLiveDocs(), r.maxDoc()), + Integer.MAX_VALUE + ); assertEquals(1, td.totalHits.value()); assertTrue(td.scoreDocs[0].score >= 0); // When it's the only vector in a segment, the score should be very close to the true score @@ -192,7 +198,7 @@ public void testSimpleOffHeapSizeImpl(Directory dir, IndexWriterConfig config, b knnVectorsReader = fieldsReader.getFieldReader("f"); } var fieldInfo = r.getFieldInfos().fieldInfo("f"); - var offHeap = OffHeapByteSizeUtils.getOffHeapByteSize(knnVectorsReader, fieldInfo); + var offHeap = knnVectorsReader.getOffHeapByteSize(fieldInfo); assertEquals(expectVecOffHeap ? 3 : 2, offHeap.size()); assertEquals(1L, (long) offHeap.get("vex")); assertTrue(offHeap.get("veb") > 0L); diff --git a/server/src/test/java/org/elasticsearch/index/codec/zstd/StoredFieldCodecDuelTests.java b/server/src/test/java/org/elasticsearch/index/codec/zstd/StoredFieldCodecDuelTests.java index 0e5732ec09e5b..34b360d797930 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/zstd/StoredFieldCodecDuelTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/zstd/StoredFieldCodecDuelTests.java @@ -10,7 +10,7 @@ package org.elasticsearch.index.codec.zstd; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.DirectoryReader; @@ -35,13 +35,13 @@ public class StoredFieldCodecDuelTests extends ESTestCase { private static final String DOUBLE_FIELD = "double_field_5"; public void testDuelBestSpeed() throws IOException { - var baseline = new LegacyPerFieldMapperCodec(Lucene101Codec.Mode.BEST_SPEED, null, BigArrays.NON_RECYCLING_INSTANCE); + var baseline = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_SPEED, null, BigArrays.NON_RECYCLING_INSTANCE); var contender = new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_SPEED, null, BigArrays.NON_RECYCLING_INSTANCE); doTestDuel(baseline, contender); } public void testDuelBestCompression() throws IOException { - var baseline = new LegacyPerFieldMapperCodec(Lucene101Codec.Mode.BEST_COMPRESSION, null, BigArrays.NON_RECYCLING_INSTANCE); + var baseline = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_COMPRESSION, null, BigArrays.NON_RECYCLING_INSTANCE); var contender = new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_COMPRESSION, null, BigArrays.NON_RECYCLING_INSTANCE); doTestDuel(baseline, contender); } diff --git 
a/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestCompressionStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestCompressionStoredFieldsFormatTests.java index b6fefcb9a4e98..f89fa52256e15 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestCompressionStoredFieldsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestCompressionStoredFieldsFormatTests.java @@ -11,11 +11,11 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; public class Zstd814BestCompressionStoredFieldsFormatTests extends BaseStoredFieldsFormatTestCase { - private final Codec codec = new Elasticsearch900Lucene101Codec(Zstd814StoredFieldsFormat.Mode.BEST_COMPRESSION); + private final Codec codec = new Elasticsearch92Lucene103Codec(Zstd814StoredFieldsFormat.Mode.BEST_COMPRESSION); @Override protected Codec getCodec() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestSpeedStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestSpeedStoredFieldsFormatTests.java index 98318707f6c4b..f3d120ed185e7 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestSpeedStoredFieldsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestSpeedStoredFieldsFormatTests.java @@ -11,11 +11,11 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; public class Zstd814BestSpeedStoredFieldsFormatTests extends BaseStoredFieldsFormatTestCase { - private final Codec codec = new Elasticsearch900Lucene101Codec(Zstd814StoredFieldsFormat.Mode.BEST_SPEED); + private final Codec codec = new Elasticsearch92Lucene103Codec(Zstd814StoredFieldsFormat.Mode.BEST_SPEED); @Override protected Codec getCodec() { diff --git a/server/src/test/java/org/elasticsearch/index/mapper/DateFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/DateFieldTypeTests.java index 010b618b7626d..735fdd4b9cb3b 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/DateFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/DateFieldTypeTests.java @@ -369,15 +369,12 @@ public void testRangeQuery() throws IOException { Query expected = new IndexOrDocValuesQuery( LongPoint.newRangeQuery("field", instant1, instant2), SortedNumericDocValuesField.newSlowRangeQuery("field", instant1, instant2) - ); + ).rewrite(newSearcher(new MultiReader())); assertEquals(expected, ft.rangeQuery(date1, date2, true, true, null, null, null, context).rewrite(newSearcher(new MultiReader()))); MappedFieldType ft2 = new DateFieldType("field", false); Query expected2 = SortedNumericDocValuesField.newSlowRangeQuery("field", instant1, instant2); - assertEquals( - expected2, - ft2.rangeQuery(date1, date2, true, true, null, null, null, context).rewrite(newSearcher(new MultiReader())) - ); + assertEquals(expected2, ft2.rangeQuery(date1, date2, true, true, null, null, null, context)); instant1 = nowInMillis; instant2 = instant1 + 100; @@ -412,15 +409,12 @@ public void testRangeQuerySubseconds() throws IOException { Query expected = 
new IndexOrDocValuesQuery( LongPoint.newRangeQuery("field", instant1, instant2), SortedNumericDocValuesField.newSlowRangeQuery("field", instant1, instant2) - ); + ).rewrite(newSearcher(new MultiReader())); assertEquals(expected, ft.rangeQuery(date1, date2, true, true, null, null, null, context).rewrite(newSearcher(new MultiReader()))); MappedFieldType ft2 = new DateFieldType("field", false); Query expected2 = SortedNumericDocValuesField.newSlowRangeQuery("field", instant1, instant2); - assertEquals( - expected2, - ft2.rangeQuery(date1, date2, true, true, null, null, null, context).rewrite(newSearcher(new MultiReader())) - ); + assertEquals(expected2, ft2.rangeQuery(date1, date2, true, true, null, null, null, context)); instant1 = nowInMillis; instant2 = instant1 + 100; @@ -454,12 +448,12 @@ public void testRangeQueryMillis() throws IOException { Query expected = new IndexOrDocValuesQuery( LongPoint.newRangeQuery("field", instant1, instant2), SortedNumericDocValuesField.newSlowRangeQuery("field", instant1, instant2) - ); + ).rewrite(newSearcher(new MultiReader())); assertEquals(expected, ft.rangeQuery(instant1, instant2, true, true, context).rewrite(newSearcher(new MultiReader()))); DateFieldType ft2 = new DateFieldType("field", false); Query expected2 = SortedNumericDocValuesField.newSlowRangeQuery("field", instant1, instant2); - assertEquals(expected2, ft2.rangeQuery(instant1, instant2, true, true, context).rewrite(newSearcher(new MultiReader()))); + assertEquals(expected2, ft2.rangeQuery(instant1, instant2, true, true, context)); assertIndexUnsearchable( Resolution.MILLISECONDS, @@ -480,15 +474,12 @@ public void testRangeQueryNanos() throws IOException { Query expected = new IndexOrDocValuesQuery( LongPoint.newRangeQuery("field", instant1, instant2), SortedNumericDocValuesField.newSlowRangeQuery("field", instant1, instant2) - ); + ).rewrite(newSearcher(new MultiReader())); assertEquals(expected, ft.rangeQuery(instant1, instant2, true, true, context).rewrite(newSearcher(new MultiReader()))); DateFieldType ft2 = new DateFieldType("field", false, Resolution.NANOSECONDS); Query expected2 = SortedNumericDocValuesField.newSlowRangeQuery("field", instant1, instant2); - assertEquals( - expected2, - ft2.rangeQuery(date1, date2, true, true, null, null, null, context).rewrite(newSearcher(new MultiReader())) - ); + assertEquals(expected2, ft2.rangeQuery(date1, date2, true, true, null, null, null, context)); assertIndexUnsearchable(Resolution.NANOSECONDS, (unsearchable) -> unsearchable.rangeQuery(instant1, instant2, true, true, context)); } diff --git a/server/src/test/java/org/elasticsearch/index/query/MatchPhraseQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/MatchPhraseQueryBuilderTests.java index 4d76f7c522417..3532751359cfe 100644 --- a/server/src/test/java/org/elasticsearch/index/query/MatchPhraseQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/MatchPhraseQueryBuilderTests.java @@ -11,6 +11,7 @@ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexOrDocValuesQuery; +import org.apache.lucene.search.IndexSortSortedNumericDocValuesRangeQuery; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.PhraseQuery; @@ -106,6 +107,7 @@ protected void doAssertLuceneQuery(MatchPhraseQueryBuilder queryBuilder, Query q .or(instanceOf(PointRangeQuery.class)) .or(instanceOf(IndexOrDocValuesQuery.class)) .or(instanceOf(MatchNoDocsQuery.class)) + 
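The date-range assertions change shape because queries of this family rewrite more aggressively under Lucene 10.3 (the query-builder tests below start accepting `IndexOrDocValuesQuery` and `IndexSortSortedNumericDocValuesRangeQuery` for the same reason), so the expected query must be rewritten with the same searcher before comparison, while the doc-values-only expectation is already in final form and needs no rewrite on either side. The comparison pattern, sketched with a plain `IndexSearcher` over an empty `MultiReader` in place of the test framework's `newSearcher`:

```java
import java.io.IOException;

import org.apache.lucene.index.MultiReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

final class RewriteEqualsSketch {
    // Rewrite both sides against the same searcher so rewrite-sensitive
    // queries compare in their post-rewrite form.
    static boolean sameAfterRewrite(Query expected, Query actual) throws IOException {
        IndexSearcher searcher = new IndexSearcher(new MultiReader());
        return expected.rewrite(searcher).equals(actual.rewrite(searcher));
    }
}
```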
.or(instanceOf(IndexSortSortedNumericDocValuesRangeQuery.class)) ); } diff --git a/server/src/test/java/org/elasticsearch/index/query/TermQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/TermQueryBuilderTests.java index bbac216754eed..9d5db1b3c32f8 100644 --- a/server/src/test/java/org/elasticsearch/index/query/TermQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/TermQueryBuilderTests.java @@ -12,6 +12,8 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.AutomatonQuery; +import org.apache.lucene.search.IndexOrDocValuesQuery; +import org.apache.lucene.search.IndexSortSortedNumericDocValuesRangeQuery; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.PointRangeQuery; import org.apache.lucene.search.Query; @@ -93,6 +95,8 @@ protected void doAssertLuceneQuery(TermQueryBuilder queryBuilder, Query query, S either(instanceOf(TermQuery.class)).or(instanceOf(PointRangeQuery.class)) .or(instanceOf(MatchNoDocsQuery.class)) .or(instanceOf(AutomatonQuery.class)) + .or(instanceOf(IndexOrDocValuesQuery.class)) + .or(instanceOf(IndexSortSortedNumericDocValuesRangeQuery.class)) ); MappedFieldType mapper = context.getFieldType(queryBuilder.fieldName()); if (query instanceof TermQuery termQuery) { diff --git a/server/src/test/java/org/elasticsearch/index/similarity/ScriptedSimilarityTests.java b/server/src/test/java/org/elasticsearch/index/similarity/ScriptedSimilarityTests.java index fa5f713dfd672..47b1b59306019 100644 --- a/server/src/test/java/org/elasticsearch/index/similarity/ScriptedSimilarityTests.java +++ b/server/src/test/java/org/elasticsearch/index/similarity/ScriptedSimilarityTests.java @@ -188,7 +188,8 @@ public double execute( StackTraceElement[] stackTraceElements = Thread.currentThread().getStackTrace(); if (Arrays.stream(stackTraceElements).anyMatch(ste -> { - return ste.getClassName().endsWith(".TermScorer") && ste.getMethodName().equals("score"); + return ste.getClassName().endsWith(".TermScorer") + && (ste.getMethodName().equals("score") || ste.getMethodName().equals("nextDocsAndScores")); }) == false) { // this might happen when computing max scores return Float.MAX_VALUE; diff --git a/server/src/test/java/org/elasticsearch/index/store/FsDirectoryFactoryTests.java b/server/src/test/java/org/elasticsearch/index/store/FsDirectoryFactoryTests.java index bbb78053e7518..3152513dcc276 100644 --- a/server/src/test/java/org/elasticsearch/index/store/FsDirectoryFactoryTests.java +++ b/server/src/test/java/org/elasticsearch/index/store/FsDirectoryFactoryTests.java @@ -9,16 +9,12 @@ package org.elasticsearch.index.store; import org.apache.lucene.store.AlreadyClosedException; -import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FilterDirectory; import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.store.NIOFSDirectory; import org.apache.lucene.store.NoLockFactory; -import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.store.SleepingLockWrapper; import org.apache.lucene.util.Constants; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -38,7 +34,6 @@ import java.nio.file.Path; import java.util.Arrays; import java.util.HashMap; -import java.util.List; import java.util.Locale; import java.util.Map; import 
java.util.Set; @@ -82,29 +77,6 @@ public void testPreload() throws IOException { } } - public void testDisableRandomAdvice() throws IOException { - Directory dir = new FilterDirectory(new ByteBuffersDirectory()) { - @Override - public IndexInput openInput(String name, IOContext context) throws IOException { - assertFalse(context.readAdvice() == ReadAdvice.RANDOM); - return super.openInput(name, context); - } - }; - Directory noRandomAccessDir = FsDirectoryFactory.disableRandomAdvice(dir); - try (IndexOutput out = noRandomAccessDir.createOutput("foo", IOContext.DEFAULT)) { - out.writeInt(42); - } - // Test the tester - expectThrows(AssertionError.class, () -> dir.openInput("foo", IOContext.DEFAULT.withReadAdvice(ReadAdvice.RANDOM))); - - // The wrapped directory shouldn't fail regardless of the IOContext - for (IOContext context : List.of(IOContext.DEFAULT, IOContext.READONCE, IOContext.DEFAULT.withReadAdvice(ReadAdvice.RANDOM))) { - try (IndexInput in = noRandomAccessDir.openInput("foo", context)) { - assertEquals(42, in.readInt()); - } - } - } - private Directory newDirectory(Settings settings) throws IOException { IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("foo", settings); Path tempDir = createTempDir().resolve(idxSettings.getUUID()).resolve("0"); @@ -119,7 +91,7 @@ static class PreLoadExposingFsDirectoryFactory extends FsDirectoryFactory { final Map> preLoadFuncMap = new HashMap<>(); @Override - public MMapDirectory setPreload(MMapDirectory mMapDirectory, Set preLoadExtensions) { + public MMapDirectory setMMapFunctions(MMapDirectory mMapDirectory, Set preLoadExtensions) { var preLoadFunc = FsDirectoryFactory.getPreloadFunc(preLoadExtensions); mMapDirectory.setPreload(preLoadFunc); preLoadFuncMap.put(mMapDirectory, preLoadFunc); diff --git a/server/src/test/java/org/elasticsearch/search/SearchCancellationTests.java b/server/src/test/java/org/elasticsearch/search/SearchCancellationTests.java index aa2e76f512cc8..329f8806b552c 100644 --- a/server/src/test/java/org/elasticsearch/search/SearchCancellationTests.java +++ b/server/src/test/java/org/elasticsearch/search/SearchCancellationTests.java @@ -21,6 +21,7 @@ import org.apache.lucene.index.PointValues; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.AcceptDocs; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.TotalHitCountCollectorManager; @@ -201,7 +202,13 @@ public void testExitableDirectoryReaderVectors() throws IOException { expectThrows(TaskCancelledException.class, () -> leaf.getFloatVectorValues(KNN_FIELD_NAME)); expectThrows( TaskCancelledException.class, - () -> leaf.searchNearestVectors(KNN_FIELD_NAME, new float[] { 1f, 1f, 1f }, 2, leaf.getLiveDocs(), Integer.MAX_VALUE) + () -> leaf.searchNearestVectors( + KNN_FIELD_NAME, + new float[] { 1f, 1f, 1f }, + 2, + AcceptDocs.fromLiveDocs(leaf.getLiveDocs(), leaf.maxDoc()), + Integer.MAX_VALUE + ) ); cancelled.set(false); // Avoid exception during construction of the wrapper objects diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java index 06600441b0a44..505e4c09aba1a 100644 --- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java +++ 
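The `setPreload` to `setMMapFunctions` rename on the factory hook reflects that the override now funnels through `MMapDirectory.setPreload(BiPredicate)`, with the predicate produced by `FsDirectoryFactory.getPreloadFunc`. A rough sketch of what such a predicate plausibly looks like; the extension matching is an assumption for illustration, only the `setPreload(BiPredicate)` entry point is taken from the diff:

```java
import java.io.IOException;
import java.nio.file.Path;
import java.util.Set;
import java.util.function.BiPredicate;

import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.MMapDirectory;

final class PreloadSketch {
    static MMapDirectory withPreload(Path path, Set<String> preLoadExtensions) throws IOException {
        MMapDirectory dir = new MMapDirectory(path);
        // Assumed behavior: preload files whose extension is configured, or all files for "*".
        BiPredicate<String, IOContext> preLoadFunc = (fileName, context) -> {
            int dot = fileName.lastIndexOf('.');
            String extension = dot < 0 ? "" : fileName.substring(dot + 1);
            return preLoadExtensions.contains("*") || preLoadExtensions.contains(extension);
        };
        dir.setPreload(preLoadFunc);
        return dir;
    }
}
```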
diff --git a/server/src/test/java/org/elasticsearch/search/SearchCancellationTests.java b/server/src/test/java/org/elasticsearch/search/SearchCancellationTests.java
index aa2e76f512cc8..329f8806b552c 100644
--- a/server/src/test/java/org/elasticsearch/search/SearchCancellationTests.java
+++ b/server/src/test/java/org/elasticsearch/search/SearchCancellationTests.java
@@ -21,6 +21,7 @@
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.AcceptDocs;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.TotalHitCountCollectorManager;
@@ -201,7 +202,13 @@ public void testExitableDirectoryReaderVectors() throws IOException {
         expectThrows(TaskCancelledException.class, () -> leaf.getFloatVectorValues(KNN_FIELD_NAME));
         expectThrows(
             TaskCancelledException.class,
-            () -> leaf.searchNearestVectors(KNN_FIELD_NAME, new float[] { 1f, 1f, 1f }, 2, leaf.getLiveDocs(), Integer.MAX_VALUE)
+            () -> leaf.searchNearestVectors(
+                KNN_FIELD_NAME,
+                new float[] { 1f, 1f, 1f },
+                2,
+                AcceptDocs.fromLiveDocs(leaf.getLiveDocs(), leaf.maxDoc()),
+                Integer.MAX_VALUE
+            )
         );
         cancelled.set(false); // Avoid exception during construction of the wrapper objects
diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java
index 06600441b0a44..505e4c09aba1a 100644
--- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java
+++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java
@@ -35,6 +35,7 @@
 import org.apache.lucene.index.StoredFields;
 import org.apache.lucene.index.TermVectors;
 import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.AcceptDocs;
 import org.apache.lucene.search.CollectionTerminatedException;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.KnnCollector;
@@ -490,12 +491,13 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException {
             }

             @Override
-            public void searchNearestVectors(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException {
+            public void searchNearestVectors(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs)
+                throws IOException {

             }

             @Override
-            public void searchNearestVectors(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException {
+            public void searchNearestVectors(String field, byte[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException {

             }
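Lucene 10.3 replaces the raw Bits live-docs parameter with an AcceptDocs abstraction across the vector-search APIs, and the pattern used throughout these tests is the fromLiveDocs factory. A minimal sketch, assuming a LeafReader and an illustrative field name and query vector:

    import java.io.IOException;
    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.search.AcceptDocs;
    import org.apache.lucene.search.TopDocs;

    class AcceptDocsSketch {
        static TopDocs search(LeafReader reader) throws IOException {
            // Wrap the segment's live docs (null means "no deletes") in AcceptDocs.
            AcceptDocs acceptDocs = AcceptDocs.fromLiveDocs(reader.getLiveDocs(), reader.maxDoc());
            return reader.searchNearestVectors("vector", new float[] { 1f, 0f, 0f }, 10, acceptDocs, Integer.MAX_VALUE);
        }
    }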
diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/range/RangeAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/range/RangeAggregatorTests.java
index ab92ea8593445..aa7e312f601af 100644
--- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/range/RangeAggregatorTests.java
+++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/range/RangeAggregatorTests.java
@@ -592,7 +592,7 @@ public void testOverlappingRanges() throws IOException {
     /**
      * If the top level query is a runtime field we use the standard aggregator
      * because it's marginally faster. You'd expect it to be a *ton* faster but
-     * usually the ranges drive the iteration and they are still fairly fast.
+     * usually the ranges drive the iteration, and they are still fairly fast.
      * But the union operation overhead that comes with combining the range with
      * the top level query tends to slow us down more than the standard aggregator.
      */
@@ -608,7 +608,8 @@ public void execute() {
         Query query = new StringScriptFieldTermQuery(new Script("dummy"), scriptFactory, "dummy", "cat", false);
         debugTestCase(new RangeAggregationBuilder("r").field(NUMBER_FIELD_NAME).addRange(0, 1).addRange(1, 2).addRange(2, 3), query, iw -> {
             for (int d = 0; d < totalDocs; d++) {
-                iw.addDocument(List.of(new IntPoint(NUMBER_FIELD_NAME, 0), new SortedNumericDocValuesField(NUMBER_FIELD_NAME, 0)));
+                int v = d % 2;
+                iw.addDocument(List.of(new IntPoint(NUMBER_FIELD_NAME, v), new SortedNumericDocValuesField(NUMBER_FIELD_NAME, v)));
             }
         }, (InternalRange r, Class impl, Map<String, Map<String, Object>> debug) -> {
             assertThat(
@@ -619,7 +620,7 @@ public void execute() {
             assertThat(r.getBuckets().stream().map(InternalRange.Bucket::getTo).collect(toList()), equalTo(List.of(1.0, 2.0, 3.0)));
             assertThat(
                 r.getBuckets().stream().map(InternalRange.Bucket::getDocCount).collect(toList()),
-                equalTo(List.of(totalDocs, 0L, 0L))
+                equalTo(List.of(totalDocs / 2, totalDocs / 2, 0L))
             );
             assertThat(impl, equalTo(RangeAggregator.NoOverlap.class));
             assertMap(
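With the indexed values now alternating between 0 and 1, half of the documents land in the [0, 1) bucket and half in [1, 2), which is exactly what the updated doc-count assertion encodes. A quick sanity check of that arithmetic (a standalone sketch; the doc count is illustrative, and totalDocs is assumed even as in the test):

    class BucketCountSketch {
        public static void main(String[] args) {
            long totalDocs = 100;         // illustrative even count
            long bucket0 = totalDocs / 2; // d % 2 == 0 -> value 0 -> range [0, 1)
            long bucket1 = totalDocs / 2; // d % 2 == 1 -> value 1 -> range [1, 2)
            long bucket2 = 0L;            // no document carries value 2
            System.out.println((bucket0 + bucket1 + bucket2) == totalDocs); // true
        }
    }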
diff --git a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
index 0d7f16211aa51..9f2fb0d91a1cc 100644
--- a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
+++ b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -815,7 +815,8 @@ public void testNumericSortOptimization() throws Exception {
         final SortAndFormats formatsLongDate = new SortAndFormats(sortLongDate, new DocValueFormat[] { DocValueFormat.RAW, dvFormatDate });
         final SortAndFormats formatsDateLong = new SortAndFormats(sortDateLong, new DocValueFormat[] { dvFormatDate, DocValueFormat.RAW });

-        Query q = LongPoint.newRangeQuery(fieldNameLong, startLongValue, startLongValue + numDocs);
+        // query all but one doc so the range cannot be rewritten to a MatchAllDocsQuery, which keeps the assertions meaningful
+        Query q = LongPoint.newRangeQuery(fieldNameLong, startLongValue, startLongValue + numDocs - 2);

         // 1. Test sort optimization on long field
         try (TestSearchContext searchContext = createContext(newContextSearcher(reader), q)) {
@@ -883,7 +884,7 @@ public void testNumericSortOptimization() throws Exception {
             QueryPhase.addCollectorsAndSearch(searchContext);
             assertTrue(searchContext.sort().sort.getSort()[0].getOptimizeSortWithPoints());
             assertThat(searchContext.queryResult().topDocs().topDocs.scoreDocs, arrayWithSize(0));
-            assertThat(searchContext.queryResult().topDocs().topDocs.totalHits.value(), equalTo((long) numDocs));
+            assertThat(searchContext.queryResult().topDocs().topDocs.totalHits.value(), equalTo((long) numDocs - 1));
             assertThat(searchContext.queryResult().topDocs().topDocs.totalHits.relation(), equalTo(TotalHits.Relation.EQUAL_TO));
         }

@@ -994,8 +995,7 @@ public void testMinScore() throws Exception {
             QueryPhase.addCollectorsAndSearch(context);
             TotalHits totalHits = context.queryResult().topDocs().topDocs.totalHits;
             assertThat(totalHits.value(), greaterThanOrEqualTo(5L));
-            var expectedRelation = totalHits.value() == 10 ? Relation.EQUAL_TO : Relation.GREATER_THAN_OR_EQUAL_TO;
-            assertThat(totalHits.relation(), is(expectedRelation));
+            assertThat(totalHits.relation(), is(Relation.GREATER_THAN_OR_EQUAL_TO));
         }
     }
diff --git a/server/src/test/java/org/elasticsearch/search/vectors/RescoreKnnVectorQueryTests.java b/server/src/test/java/org/elasticsearch/search/vectors/RescoreKnnVectorQueryTests.java
index c2ba061ad38bc..8e62e18cf02c1 100644
--- a/server/src/test/java/org/elasticsearch/search/vectors/RescoreKnnVectorQueryTests.java
+++ b/server/src/test/java/org/elasticsearch/search/vectors/RescoreKnnVectorQueryTests.java
@@ -32,7 +32,7 @@
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.store.Directory;
-import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec;
+import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec;
 import org.elasticsearch.index.codec.vectors.ES813Int8FlatVectorFormat;
 import org.elasticsearch.index.codec.vectors.ES814HnswScalarQuantizedVectorsFormat;
 import org.elasticsearch.index.codec.vectors.es818.ES818BinaryQuantizedVectorsFormat;
@@ -221,7 +221,7 @@ private static void addRandomDocuments(int numDocs, Directory d, int numDims) th
             new ES813Int8FlatVectorFormat(),
             new ES814HnswScalarQuantizedVectorsFormat()
         );
-        iwc.setCodec(new Elasticsearch900Lucene101Codec(randomFrom(Zstd814StoredFieldsFormat.Mode.values())) {
+        iwc.setCodec(new Elasticsearch92Lucene103Codec(randomFrom(Zstd814StoredFieldsFormat.Mode.values())) {
             @Override
             public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
                 return format;
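The codec swap keeps the familiar per-field override pattern: subclass the codec and route specific fields to a custom format. A minimal sketch against the plain Lucene 10.3 codec used elsewhere in this PR (the format choice is illustrative):

    import org.apache.lucene.codecs.KnnVectorsFormat;
    import org.apache.lucene.codecs.lucene103.Lucene103Codec;
    import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
    import org.apache.lucene.index.IndexWriterConfig;

    class CodecSketch {
        static IndexWriterConfig config() {
            IndexWriterConfig iwc = new IndexWriterConfig();
            iwc.setCodec(new Lucene103Codec() {
                @Override
                public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
                    // Route every vector field to HNSW; real code can switch on the field name.
                    return new Lucene99HnswVectorsFormat();
                }
            });
            return iwc;
        }
    }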
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/index/engine/frozen/RewriteCachingDirectoryReader.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/index/engine/frozen/RewriteCachingDirectoryReader.java
index 12864dd66a857..65dff572f9037 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/index/engine/frozen/RewriteCachingDirectoryReader.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/index/engine/frozen/RewriteCachingDirectoryReader.java
@@ -26,6 +26,7 @@
 import org.apache.lucene.index.StoredFields;
 import org.apache.lucene.index.TermVectors;
 import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.AcceptDocs;
 import org.apache.lucene.search.KnnCollector;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.Bits;
@@ -229,12 +230,12 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException {
         }

         @Override
-        public void searchNearestVectors(String field, float[] target, KnnCollector collector, Bits acceptDocs) throws IOException {
+        public void searchNearestVectors(String field, float[] target, KnnCollector collector, AcceptDocs acceptDocs) throws IOException {
             throw new UnsupportedOperationException();
         }

         @Override
-        public void searchNearestVectors(String field, byte[] target, KnnCollector collector, Bits acceptDocs) throws IOException {
+        public void searchNearestVectors(String field, byte[] target, KnnCollector collector, AcceptDocs acceptDocs) throws IOException {
             throw new UnsupportedOperationException();
         }
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/authz/accesscontrol/FieldSubsetReader.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/authz/accesscontrol/FieldSubsetReader.java
index bddb274b21226..1b25b36e1d2db 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/authz/accesscontrol/FieldSubsetReader.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/authz/accesscontrol/FieldSubsetReader.java
@@ -28,9 +28,9 @@
 import org.apache.lucene.index.TermVectors;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.AcceptDocs;
 import org.apache.lucene.search.KnnCollector;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FilterIterator;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -313,7 +313,7 @@ public FloatVectorValues getFloatVectorValues(String field) throws IOException {
     }

     @Override
-    public void searchNearestVectors(String field, float[] target, KnnCollector collector, Bits acceptDocs) throws IOException {
+    public void searchNearestVectors(String field, float[] target, KnnCollector collector, AcceptDocs acceptDocs) throws IOException {
         if (hasField(field)) {
             super.searchNearestVectors(field, target, collector, acceptDocs);
         }
@@ -325,7 +325,7 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException {
     }

     @Override
-    public void searchNearestVectors(String field, byte[] target, KnnCollector collector, Bits acceptDocs) throws IOException {
+    public void searchNearestVectors(String field, byte[] target, KnnCollector collector, AcceptDocs acceptDocs) throws IOException {
         if (hasField(field)) {
             super.searchNearestVectors(field, target, collector, acceptDocs);
         }
diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/security/authz/accesscontrol/FieldSubsetReaderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/security/authz/accesscontrol/FieldSubsetReaderTests.java
index 88a6be2c215a2..23a2792663ec0 100644
--- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/security/authz/accesscontrol/FieldSubsetReaderTests.java
+++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/security/authz/accesscontrol/FieldSubsetReaderTests.java
@@ -46,6 +46,7 @@
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.TermsEnum.SeekStatus;
+import org.apache.lucene.search.AcceptDocs;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.analysis.MockAnalyzer;
@@ -224,13 +225,25 @@ public void testKnnVectors() throws Exception {
         assertEquals(0, iterator.nextDoc());
         assertNotNull(vectorValues.vectorValue(iterator.index()));

-        TopDocs topDocs = leafReader.searchNearestVectors("fieldA", new float[] { 1.0f, 1.0f, 1.0f }, 5, null, Integer.MAX_VALUE);
+        TopDocs topDocs = leafReader.searchNearestVectors(
+            "fieldA",
+            new float[] { 1.0f, 1.0f, 1.0f },
+            5,
+            AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()),
+            Integer.MAX_VALUE
+        );
         assertNotNull(topDocs);
         assertEquals(1, topDocs.scoreDocs.length);

         // Check that we can't see fieldB
         assertNull(leafReader.getFloatVectorValues("fieldB"));
-        topDocs = leafReader.searchNearestVectors("fieldB", new float[] { 1.0f, 1.0f, 1.0f }, 5, null, Integer.MAX_VALUE);
+        topDocs = leafReader.searchNearestVectors(
+            "fieldB",
+            new float[] { 1.0f, 1.0f, 1.0f },
+            5,
+            AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()),
+            Integer.MAX_VALUE
+        );
         assertEquals(0, topDocs.totalHits.value());
         assertEquals(0, topDocs.scoreDocs.length);

@@ -263,13 +276,25 @@ public void testKnnByteVectors() throws Exception {
         assertEquals(0, iterator.nextDoc());
         assertNotNull(vectorValues.vectorValue(iterator.index()));

-        TopDocs topDocs = leafReader.searchNearestVectors("fieldA", new byte[] { 1, 1, 1 }, 5, null, Integer.MAX_VALUE);
+        TopDocs topDocs = leafReader.searchNearestVectors(
+            "fieldA",
+            new byte[] { 1, 1, 1 },
+            5,
+            AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()),
+            Integer.MAX_VALUE
+        );
         assertNotNull(topDocs);
         assertEquals(1, topDocs.scoreDocs.length);

         // Check that we can't see fieldB
         assertNull(leafReader.getByteVectorValues("fieldB"));
-        topDocs = leafReader.searchNearestVectors("fieldB", new byte[] { 1, 1, 1 }, 5, null, Integer.MAX_VALUE);
+        topDocs = leafReader.searchNearestVectors(
+            "fieldB",
+            new byte[] { 1, 1, 1 },
+            5,
+            AcceptDocs.fromLiveDocs(leafReader.getLiveDocs(), leafReader.maxDoc()),
+            Integer.MAX_VALUE
+        );
         assertEquals(0, topDocs.totalHits.value());
         assertEquals(0, topDocs.scoreDocs.length);
diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneSliceQueueTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneSliceQueueTests.java
index 6054401fb58d5..79f1fcc86e2c3 100644
--- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneSliceQueueTests.java
+++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/LuceneSliceQueueTests.java
@@ -23,6 +23,7 @@
 import org.apache.lucene.index.StoredFields;
 import org.apache.lucene.index.TermVectors;
 import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.AcceptDocs;
 import org.apache.lucene.search.KnnCollector;
 import org.apache.lucene.util.Bits;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
@@ -303,12 +304,13 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException {
         }

         @Override
-        public void searchNearestVectors(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException {
+        public void searchNearestVectors(String field, float[] target, KnnCollector knnCollector, AcceptDocs acceptDocs)
+            throws IOException {
             throw new UnsupportedOperationException();
         }

         @Override
-        public void searchNearestVectors(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException {
+        public void searchNearestVectors(String field, byte[] target, KnnCollector knnCollector, AcceptDocs acceptDocs) throws IOException {
             throw new UnsupportedOperationException();
         }
diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/read/ValuesSourceReaderOperatorTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/read/ValuesSourceReaderOperatorTests.java
index 3291ef4b5a2e2..2ff63697f19b1 100644
--- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/read/ValuesSourceReaderOperatorTests.java
+++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/lucene/read/ValuesSourceReaderOperatorTests.java
@@ -178,7 +178,7 @@ protected SourceOperator simpleInput(BlockFactory blockFactory, int size) {
     }

     private int commitEvery(int numDocs) {
-        return Math.max(1, (int) Math.ceil((double) numDocs / 10));
+        return Math.max(1, (int) Math.ceil((double) numDocs / 8));
     }

     private SourceOperator simpleInput(DriverContext context, int size, int commitEvery, int pageSize) {
@@ -723,8 +723,8 @@ private void testLoadAllStatus(boolean allInOnePage) {
         DriverContext driverContext = driverContext();
         int numDocs = between(100, 5000);
         List<Page> input = CannedSourceOperator.collectPages(simpleInput(driverContext, numDocs, commitEvery(numDocs), numDocs));
-        assertThat(reader.leaves(), hasSize(10));
-        assertThat(input, hasSize(10));
+        assertThat(reader.leaves(), hasSize(8));
+        assertThat(input, hasSize(8));
         List<FieldCase> cases = infoAndChecksForEachType(
             Block.MvOrdering.DEDUPLICATED_AND_SORTED_ASCENDING,
             Block.MvOrdering.DEDUPLICATED_AND_SORTED_ASCENDING
@@ -943,7 +943,7 @@ private void testLoadLong(boolean shuffle, boolean manySegments) throws IOExcept
         DriverContext driverContext = driverContext();
         List<Page> input = CannedSourceOperator.collectPages(sourceOperator(driverContext, numDocs));
-        assertThat(reader.leaves(), hasSize(manySegments ? greaterThan(5) : equalTo(1)));
+        assertThat(reader.leaves(), hasSize(manySegments ? greaterThan(1) : equalTo(1)));
         assertThat(input, hasSize(reader.leaves().size()));
         if (manySegments) {
             input = List.of(CannedSourceOperator.mergePages(input));
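Changing the divisor from 10 to 8 pins both the segment count and the page count at eight: committing every ceil(numDocs / 8) docs yields eight commits for any doc count in the test's range. A quick check of the arithmetic (a sketch; the doc count is illustrative):

    class CommitEverySketch {
        static int commitEvery(int numDocs) {
            return Math.max(1, (int) Math.ceil((double) numDocs / 8));
        }

        public static void main(String[] args) {
            int numDocs = 100;                                         // illustrative; the test draws between(100, 5000)
            int every = commitEvery(numDocs);                          // ceil(100 / 8) = 13
            int segments = (int) Math.ceil((double) numDocs / every);  // ceil(100 / 13) = 8
            System.out.println(every + " docs per commit -> " + segments + " segments");
        }
    }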
diff --git a/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/PushQueriesIT.java b/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/PushQueriesIT.java
index b4e34befa1dbc..a82ecd16334ac 100644
--- a/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/PushQueriesIT.java
+++ b/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/PushQueriesIT.java
@@ -151,9 +151,10 @@ public void testEqualityOrOther() throws IOException {
             FROM test
             | WHERE test == "%value" OR foo == 2
             """;
+        // query rewrite optimizations apply to foo, since its query value is always outside the range of indexed values
         String luceneQuery = switch (type) {
-            case AUTO, TEXT_WITH_KEYWORD -> "(#test.keyword:%value -_ignored:test.keyword) foo:[2 TO 2]";
-            case KEYWORD -> "test:%value foo:[2 TO 2]";
+            case AUTO, TEXT_WITH_KEYWORD -> "#test.keyword:%value -_ignored:test.keyword";
+            case KEYWORD -> "test:%value";
             case CONSTANT_KEYWORD, MATCH_ONLY_TEXT_WITH_KEYWORD -> "*:*";
             case SEMANTIC_TEXT_WITH_KEYWORD -> "FieldExistsQuery [field=_primary_term]";
         };
@@ -170,16 +171,17 @@ public void testEqualityAndOther() throws IOException {
             FROM test
             | WHERE test == "%value" AND foo == 1
             """;
+        // query rewrite optimizations apply to foo, since its query value is always within the range of indexed values
         List<String> luceneQueryOptions = switch (type) {
-            case AUTO, TEXT_WITH_KEYWORD -> List.of("#test.keyword:%value -_ignored:test.keyword #foo:[1 TO 1]");
-            case KEYWORD -> List.of("#test:%value #foo:[1 TO 1]");
-            case CONSTANT_KEYWORD, MATCH_ONLY_TEXT_WITH_KEYWORD -> List.of("foo:[1 TO 1]");
+            case AUTO, TEXT_WITH_KEYWORD -> List.of("#test.keyword:%value -_ignored:test.keyword");
+            case KEYWORD -> List.of("test:%value");
+            case CONSTANT_KEYWORD, MATCH_ONLY_TEXT_WITH_KEYWORD -> List.of("*:*");
             case SEMANTIC_TEXT_WITH_KEYWORD ->
                 /*
                  * single_value_match is here because there are extra documents hiding in the index
                  * that don't have the `foo` field.
                  */
-                List.of("#foo:[1 TO 1] #single_value_match(foo)", "foo:[1 TO 1]");
+                List.of("#FieldExistsQuery [field=foo] #single_value_match(foo)", "foo:[1 TO 1]");
         };
         ComputeSignature dataNodeSignature = switch (type) {
             case AUTO, CONSTANT_KEYWORD, KEYWORD, TEXT_WITH_KEYWORD -> ComputeSignature.FILTER_IN_QUERY;
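The dropped foo clauses come from point-query rewrite optimizations: when a segment's point index proves a clause can never match (or must match every document carrying the field), the clause simplifies before execution, which is why MatchNoDocsQuery-style drops and FieldExistsQuery show up in the expected strings. A hedged illustration; whether the rewrite fires depends on the Lucene version and the segment's min/max values:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.IntPoint;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.store.ByteBuffersDirectory;

    class RewriteSketch {
        public static void main(String[] args) throws Exception {
            try (ByteBuffersDirectory dir = new ByteBuffersDirectory(); IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
                Document doc = new Document();
                doc.add(new IntPoint("foo", 1)); // every indexed value of "foo" is 1
                w.addDocument(doc);
                w.commit();
                try (DirectoryReader reader = DirectoryReader.open(w)) {
                    IndexSearcher searcher = new IndexSearcher(reader);
                    // A value outside the segment's [min, max] may rewrite to a match-nothing query.
                    Query rewritten = searcher.rewrite(IntPoint.newExactQuery("foo", 2));
                    System.out.println(rewritten);
                }
            }
        }
    }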
diff --git a/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/SearchableSnapshotDirectory.java b/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/SearchableSnapshotDirectory.java
index d62443e492605..7c229381a5a7c 100644
--- a/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/SearchableSnapshotDirectory.java
+++ b/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/SearchableSnapshotDirectory.java
@@ -377,7 +377,7 @@ public IndexInput openInput(final String name, final IOContext context) throws I
             final BytesRef content = fileInfo.metadata().hash();
             return new ByteArrayIndexInput("ByteArrayIndexInput(" + name + ')', content.bytes, content.offset, content.length);
         }
-        if (context == Store.READONCE_CHECKSUM) {
+        if (context.hints().contains(Store.FileFooterOnly.INSTANCE)) {
             return ChecksumBlobContainerIndexInput.create(fileInfo.physicalName(), fileInfo.length(), fileInfo.checksum(), context);
         }
diff --git a/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/input/CachedBlobContainerIndexInput.java b/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/input/CachedBlobContainerIndexInput.java
index 4711043fff281..f82cba61013df 100644
--- a/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/input/CachedBlobContainerIndexInput.java
+++ b/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/input/CachedBlobContainerIndexInput.java
@@ -10,9 +10,9 @@
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.ReadAdvice;
 import org.elasticsearch.blobcache.BlobCacheUtils;
 import org.elasticsearch.blobcache.common.ByteRange;
+import org.elasticsearch.index.StandardIOBehaviorHint;
 import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot.FileInfo;
 import org.elasticsearch.xpack.searchablesnapshots.cache.common.CacheFile;
 import org.elasticsearch.xpack.searchablesnapshots.store.IndexInputStats;
@@ -36,7 +36,7 @@ public class CachedBlobContainerIndexInput extends MetadataCachingIndexInput {
      * a complete part of the {@link #fileInfo} at once in the cache and should not be
      * used for anything else than what the {@link #prefetchPart(int, Supplier)} method does.
      */
-    public static final IOContext CACHE_WARMING_CONTEXT = new IOContext(IOContext.Context.DEFAULT, null, null, ReadAdvice.NORMAL);
+    public static final IOContext CACHE_WARMING_CONTEXT = IOContext.DEFAULT.withHints(StandardIOBehaviorHint.INSTANCE);

     private static final Logger logger = LogManager.getLogger(CachedBlobContainerIndexInput.class);

@@ -102,7 +102,7 @@ private CachedBlobContainerIndexInput(

     @Override
     protected void readWithoutBlobCache(ByteBuffer b) throws Exception {
-        ensureContext(ctx -> ctx != CACHE_WARMING_CONTEXT);
+        ensureContext(ctx -> ctx.hints().contains(StandardIOBehaviorHint.INSTANCE) == false);
         final long position = getAbsolutePosition();
         final int length = b.remaining();

@@ -139,7 +139,7 @@ public long getPersistentCacheInitialLength() throws Exception {
      * or {@code -1} if the prewarming was cancelled
      */
     public long prefetchPart(final int part, Supplier<Boolean> isCancelled) throws IOException {
-        ensureContext(ctx -> ctx == CACHE_WARMING_CONTEXT);
+        ensureContext(ctx -> ctx.hints().contains(StandardIOBehaviorHint.INSTANCE));
         if (part >= fileInfo.numberOfParts()) {
             throw new IllegalArgumentException("Unexpected part number [" + part + "]");
         }
diff --git a/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/input/ChecksumBlobContainerIndexInput.java b/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/input/ChecksumBlobContainerIndexInput.java
index 552c65a6f2550..4636090158988 100644
--- a/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/input/ChecksumBlobContainerIndexInput.java
+++ b/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/store/input/ChecksumBlobContainerIndexInput.java
@@ -115,7 +115,7 @@ private int checksumPositionOrThrow(long pos) {
     }

     private static void ensureReadOnceChecksumContext(IOContext context) {
-        if (context != Store.READONCE_CHECKSUM) {
+        if (context.hints().contains(Store.FileFooterOnly.INSTANCE) == false) {
            assert false : "expected READONCE_CHECKSUM but got " + context;
            throw new IllegalArgumentException("ChecksumBlobContainerIndexInput should only be used with READONCE_CHECKSUM context");
        }
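Identity comparisons on IOContext constants do not survive this upgrade: Lucene 10.3 contexts carry a set of hints, and two equal-looking contexts need not be the same instance. The pattern adopted above is to tag a context with a hint and test membership. A minimal sketch using the same types as the diff (the helper method is illustrative):

    import org.apache.lucene.store.IOContext;
    import org.elasticsearch.index.StandardIOBehaviorHint;

    class HintSketch {
        static final IOContext WARMING = IOContext.DEFAULT.withHints(StandardIOBehaviorHint.INSTANCE);

        static boolean isWarming(IOContext ctx) {
            // Membership check replaces the old `ctx == CACHE_WARMING_CONTEXT` identity test.
            return ctx.hints().contains(StandardIOBehaviorHint.INSTANCE);
        }
    }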