diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java index 71164e35ad557..d0b72cd030609 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/TSDBDocValuesMergeBenchmark.java @@ -26,7 +26,7 @@ import org.apache.lucene.util.BytesRef; import org.elasticsearch.cluster.metadata.DataStream; import org.elasticsearch.common.logging.LogConfigurator; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -63,7 +63,6 @@ public class TSDBDocValuesMergeBenchmark { static { - // For Elasticsearch900Lucene101Codec: LogConfigurator.loadLog4jPlugins(); LogConfigurator.configureESLogging(); LogConfigurator.setNodeName("test"); @@ -259,7 +258,7 @@ private static IndexWriterConfig createIndexWriterConfig(boolean optimizedMergeE config.setLeafSorter(DataStream.TIMESERIES_LEAF_READERS_SORTER); config.setMergePolicy(new LogByteSizeMergePolicy()); var docValuesFormat = new ES819TSDBDocValuesFormat(4096, optimizedMergeEnabled); - config.setCodec(new Elasticsearch900Lucene101Codec() { + config.setCodec(new Elasticsearch92Lucene103Codec() { @Override public DocValuesFormat getDocValuesFormatForField(String field) { diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/simple/SimpleSearchIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/simple/SimpleSearchIT.java index 5a9be73d92268..dc18835460d20 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/search/simple/SimpleSearchIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/search/simple/SimpleSearchIT.java @@ -261,10 +261,12 @@ public void testSimpleTerminateAfterCount() throws Exception { ensureGreen(); refresh(); - for (int i = 1; i < max; i++) { + // query all but one doc to avoid optimizations that may rewrite to a MatchAllDocs, which simplifies assertions + final int queryMax = max - 1; + for (int i = 1; i < queryMax; i++) { final int finalI = i; assertResponse( - prepareSearch("test").setQuery(QueryBuilders.rangeQuery("field").gte(1).lte(max)).setTerminateAfter(i), + prepareSearch("test").setQuery(QueryBuilders.rangeQuery("field").gte(1).lte(queryMax)).setTerminateAfter(i), response -> { assertHitCount(response, finalI); assertTrue(response.isTerminatedEarly()); @@ -272,9 +274,9 @@ public void testSimpleTerminateAfterCount() throws Exception { ); } assertResponse( - prepareSearch("test").setQuery(QueryBuilders.rangeQuery("field").gte(1).lte(max)).setTerminateAfter(2 * max), + prepareSearch("test").setQuery(QueryBuilders.rangeQuery("field").gte(1).lte(queryMax)).setTerminateAfter(2 * max), response -> { - assertHitCount(response, max); + assertHitCount(response, queryMax); assertFalse(response.isTerminatedEarly()); } ); diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index 7edefe9fae581..20df7f456d109 100644 --- a/server/src/main/java/module-info.java +++ b/server/src/main/java/module-info.java @@ -461,7 +461,8 @@ org.elasticsearch.index.codec.Elasticsearch814Codec, 
org.elasticsearch.index.codec.Elasticsearch816Codec, org.elasticsearch.index.codec.Elasticsearch900Codec, - org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; + org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec, + org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; provides org.apache.logging.log4j.core.util.ContextDataProvider with org.elasticsearch.common.logging.DynamicContextDataProvider; diff --git a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java index daae5af9127f1..9d2595732c585 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java +++ b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java @@ -10,6 +10,7 @@ package org.elasticsearch.action.admin.indices.diskusage; import org.apache.logging.log4j.Logger; +import org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat; import org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat; import org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat; import org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat; @@ -22,7 +23,7 @@ import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.DirectoryReader; @@ -318,6 +319,9 @@ private static void readProximity(Terms terms, PostingsEnum postings) throws IOE private static BlockTermState getBlockTermState(TermsEnum termsEnum, BytesRef term) throws IOException { if (term != null && termsEnum.seekExact(term)) { final TermState termState = termsEnum.termState(); + if (termState instanceof final Lucene103PostingsFormat.IntBlockTermState blockTermState) { + return new BlockTermState(blockTermState.docStartFP, blockTermState.posStartFP, blockTermState.payStartFP); + } if (termState instanceof final Lucene101PostingsFormat.IntBlockTermState blockTermState) { return new BlockTermState(blockTermState.docStartFP, blockTermState.posStartFP, blockTermState.payStartFP); } diff --git a/server/src/main/java/org/elasticsearch/common/lucene/Lucene.java b/server/src/main/java/org/elasticsearch/common/lucene/Lucene.java index 99ed0917b12bf..09f61d60c0b27 100644 --- a/server/src/main/java/org/elasticsearch/common/lucene/Lucene.java +++ b/server/src/main/java/org/elasticsearch/common/lucene/Lucene.java @@ -93,7 +93,7 @@ public class Lucene { - public static final String LATEST_CODEC = "Lucene101"; + public static final String LATEST_CODEC = "Lucene103"; public static final String SOFT_DELETES_FIELD = "__soft_deletes"; diff --git a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java index 5f887b2b594d3..17028137b78d8 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java +++ b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java @@ -12,7 +12,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.FilterCodec; -import 
org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.util.FeatureFlag; import org.elasticsearch.core.Nullable; @@ -46,7 +46,7 @@ public class CodecService implements CodecProvider { public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays) { final var codecs = new HashMap(); - Codec legacyBestSpeedCodec = new LegacyPerFieldMapperCodec(Lucene101Codec.Mode.BEST_SPEED, mapperService, bigArrays); + Codec legacyBestSpeedCodec = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_SPEED, mapperService, bigArrays); if (ZSTD_STORED_FIELDS_FEATURE_FLAG) { codecs.put(DEFAULT_CODEC, new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_SPEED, mapperService, bigArrays)); } else { @@ -58,7 +58,7 @@ public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays) BEST_COMPRESSION_CODEC, new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_COMPRESSION, mapperService, bigArrays) ); - Codec legacyBestCompressionCodec = new LegacyPerFieldMapperCodec(Lucene101Codec.Mode.BEST_COMPRESSION, mapperService, bigArrays); + Codec legacyBestCompressionCodec = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_COMPRESSION, mapperService, bigArrays); codecs.put(LEGACY_BEST_COMPRESSION_CODEC, legacyBestCompressionCodec); codecs.put(LUCENE_DEFAULT_CODEC, Codec.getDefault()); diff --git a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java index d96495fb0f615..c0f633bf62268 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java +++ b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch900Lucene101Codec.java @@ -9,12 +9,12 @@ package org.elasticsearch.index.codec; +import org.apache.lucene.backward_codecs.lucene101.Lucene101Codec; +import org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.StoredFieldsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; diff --git a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch92Lucene103Codec.java b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch92Lucene103Codec.java new file mode 100644 index 0000000000000..c26d485fc8c99 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch92Lucene103Codec.java @@ -0,0 +1,133 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.codec; + +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; +import org.elasticsearch.index.codec.perfield.XPerFieldDocValuesFormat; +import org.elasticsearch.index.codec.zstd.Zstd814StoredFieldsFormat; + +/** + * Elasticsearch codec as of 9.2 relying on Lucene 10.3. This extends the Lucene 10.3 codec to compressed + * stored fields with ZSTD instead of LZ4/DEFLATE. See {@link Zstd814StoredFieldsFormat}. + */ +public class Elasticsearch92Lucene103Codec extends CodecService.DeduplicateFieldInfosCodec { + + static final PostingsFormat DEFAULT_POSTINGS_FORMAT = new Lucene103PostingsFormat(); + + private final StoredFieldsFormat storedFieldsFormat; + + private final PostingsFormat defaultPostingsFormat; + private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Elasticsearch92Lucene103Codec.this.getPostingsFormatForField(field); + } + }; + + private final DocValuesFormat defaultDVFormat; + private final DocValuesFormat docValuesFormat = new XPerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Elasticsearch92Lucene103Codec.this.getDocValuesFormatForField(field); + } + }; + + private final KnnVectorsFormat defaultKnnVectorsFormat; + private final KnnVectorsFormat knnVectorsFormat = new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return Elasticsearch92Lucene103Codec.this.getKnnVectorsFormatForField(field); + } + }; + + /** Public no-arg constructor, needed for SPI loading at read-time. */ + public Elasticsearch92Lucene103Codec() { + this(Zstd814StoredFieldsFormat.Mode.BEST_SPEED); + } + + /** + * Constructor. Takes a {@link Zstd814StoredFieldsFormat.Mode} that describes whether to optimize for retrieval speed at the expense of + * worse space-efficiency or vice-versa. + */ + public Elasticsearch92Lucene103Codec(Zstd814StoredFieldsFormat.Mode mode) { + super("Elasticsearch92Lucene103", new Lucene103Codec()); + this.storedFieldsFormat = mode.getFormat(); + this.defaultPostingsFormat = DEFAULT_POSTINGS_FORMAT; + this.defaultDVFormat = new Lucene90DocValuesFormat(); + this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); + } + + @Override + public StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + @Override + public final KnnVectorsFormat knnVectorsFormat() { + return knnVectorsFormat; + } + + /** + * Returns the postings format that should be used for writing new segments of field. + * + *

The default implementation always returns {@link Lucene103PostingsFormat}. + + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultPostingsFormat; + } + + /** + * Returns the docvalues format that should be used for writing new segments of field. + + *

The default implementation always returns {@link Lucene90DocValuesFormat}. + + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + /** + * Returns the vectors format that should be used for writing new segments of field + + *

The default implementation always returns {@link Lucene99HnswVectorsFormat}. + + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/LegacyPerFieldMapperCodec.java b/server/src/main/java/org/elasticsearch/index/codec/LegacyPerFieldMapperCodec.java index 9e4ecb1a46c17..c3e9ab6617b87 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/LegacyPerFieldMapperCodec.java +++ b/server/src/main/java/org/elasticsearch/index/codec/LegacyPerFieldMapperCodec.java @@ -13,7 +13,7 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.index.mapper.MapperService; @@ -22,11 +22,11 @@ * Legacy version of {@link PerFieldMapperCodec}. This codec is preserved to give an escape hatch in case we encounter issues with new * changes in {@link PerFieldMapperCodec}. */ -public final class LegacyPerFieldMapperCodec extends Lucene101Codec { +public final class LegacyPerFieldMapperCodec extends Lucene103Codec { private final PerFieldFormatSupplier formatSupplier; - public LegacyPerFieldMapperCodec(Lucene101Codec.Mode compressionMode, MapperService mapperService, BigArrays bigArrays) { + public LegacyPerFieldMapperCodec(Lucene103Codec.Mode compressionMode, MapperService mapperService, BigArrays bigArrays) { super(compressionMode); this.formatSupplier = new PerFieldFormatSupplier(mapperService, bigArrays); // If the below assertion fails, it is a sign that Lucene released a new codec. You must create a copy of the current Elasticsearch diff --git a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java index 4a79df430505d..8198963184763 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java @@ -12,7 +12,6 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.elasticsearch.common.util.BigArrays; @@ -34,13 +33,12 @@ * vectors. 
*/ public class PerFieldFormatSupplier { - public static final FeatureFlag USE_LUCENE101_POSTINGS_FORMAT = new FeatureFlag("use_lucene101_postings_format"); + public static final FeatureFlag USE_DEFAULT_LUCENE_POSTINGS_FORMAT = new FeatureFlag("use_default_lucene_postings_format"); private static final DocValuesFormat docValuesFormat = new Lucene90DocValuesFormat(); private static final KnnVectorsFormat knnVectorsFormat = new Lucene99HnswVectorsFormat(); private static final ES819TSDBDocValuesFormat tsdbDocValuesFormat = new ES819TSDBDocValuesFormat(); private static final ES812PostingsFormat es812PostingsFormat = new ES812PostingsFormat(); - private static final Lucene101PostingsFormat lucene101PostingsFormat = new Lucene101PostingsFormat(); private static final PostingsFormat completionPostingsFormat = PostingsFormat.forName("Completion101"); private final ES87BloomFilterPostingsFormat bloomFilterPostingsFormat; @@ -53,10 +51,10 @@ public PerFieldFormatSupplier(MapperService mapperService, BigArrays bigArrays) this.bloomFilterPostingsFormat = new ES87BloomFilterPostingsFormat(bigArrays, this::internalGetPostingsFormatForField); if (mapperService != null - && USE_LUCENE101_POSTINGS_FORMAT.isEnabled() - && mapperService.getIndexSettings().getIndexVersionCreated().onOrAfter(IndexVersions.USE_LUCENE101_POSTINGS_FORMAT) + && USE_DEFAULT_LUCENE_POSTINGS_FORMAT.isEnabled() + && mapperService.getIndexSettings().getIndexVersionCreated().onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_3_0) && mapperService.getIndexSettings().getMode() == IndexMode.STANDARD) { - defaultPostingsFormat = lucene101PostingsFormat; + defaultPostingsFormat = Elasticsearch92Lucene103Codec.DEFAULT_POSTINGS_FORMAT; } else { // our own posting format using PFOR defaultPostingsFormat = es812PostingsFormat; diff --git a/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodec.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodec.java index 9a3055f96bba8..0ffb63270fd58 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodec.java +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodec.java @@ -26,7 +26,7 @@ * per index in real time via the mapping API. If no specific postings format or vector format is * configured for a specific field the default postings or vector format is used. 
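A minimal sketch of how these per-field hooks are exercised against the new codec, mirroring the TSDBDocValuesMergeBenchmark change at the top of this diff; the class name and the routing of every field to the TSDB doc-values format are illustrative assumptions (the benchmark passes a 4096 skip-index interval and an optimized-merge flag instead of the no-arg constructor):

    import org.apache.lucene.codecs.DocValuesFormat;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec;
    import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat;

    class CodecWiringSketch {
        static IndexWriterConfig newConfig() {
            var tsdbDocValues = new ES819TSDBDocValuesFormat();
            var codec = new Elasticsearch92Lucene103Codec() {
                @Override
                public DocValuesFormat getDocValuesFormatForField(String field) {
                    // illustrative: send every field to the TSDB doc-values format;
                    // real code would choose per field, as PerFieldFormatSupplier does
                    return tsdbDocValues;
                }
            };
            return new IndexWriterConfig().setCodec(codec);
        }
    }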
*/ -public final class PerFieldMapperCodec extends Elasticsearch900Lucene101Codec { +public final class PerFieldMapperCodec extends Elasticsearch92Lucene103Codec { private final PerFieldFormatSupplier formatSupplier; diff --git a/server/src/main/java/org/elasticsearch/index/codec/postings/ES812PostingsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/postings/ES812PostingsFormat.java index 6ccfaba7853f2..4fb6bcd00b1fb 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/postings/ES812PostingsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/postings/ES812PostingsFormat.java @@ -19,6 +19,7 @@ */ package org.elasticsearch.index.codec.postings; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; @@ -27,8 +28,6 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -100,7 +99,7 @@ *

Term Dictionary *

The .tim file contains the list of terms in each field along with per-term statistics * (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the - * .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on + * .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsReader} for more details on * the format. *
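An aside on how these file pointers surface at the API level: the IndexDiskUsageAnalyzer hunk earlier in this diff pulls the .doc/.pos/.pay start pointers out of the term state for the new Lucene103 postings format. A condensed sketch of that pattern follows; the Pointers record is a hypothetical holder, not the analyzer's own BlockTermState wrapper:

    import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat;
    import org.apache.lucene.index.TermState;

    final class TermFilePointersSketch {
        record Pointers(long docStartFP, long posStartFP, long payStartFP) {}

        static Pointers fromTermState(TermState termState) {
            if (termState instanceof Lucene103PostingsFormat.IntBlockTermState blockTermState) {
                // start file pointers into the .doc, .pos and .pay files for this term
                return new Pointers(blockTermState.docStartFP, blockTermState.posStartFP, blockTermState.payStartFP);
            }
            return null; // the real analyzer falls through to the older IntBlockTermState variants
        }
    }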

NOTE: The term dictionary can plug into different postings implementations: the postings * writer/reader are actually responsible for encoding and decoding the PostingsHeader and @@ -155,7 +154,7 @@ *

*
Term Index *

The .tip file contains an index into the term dictionary, so that it can be accessed - * randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format. + * randomly. See {@link Lucene90BlockTreeTermsReader} for more details on the format. *

* * @@ -343,7 +342,7 @@ * * */ -public final class ES812PostingsFormat extends PostingsFormat { +public class ES812PostingsFormat extends PostingsFormat { /** * Filename extension for document number, frequencies, and skip data. See chapter: Writes terms dict and index, block-encoding (column stride) each term's metadata for each set + * of terms between two index terms. + * + *

Files: + * + *

+ * + *

+ * + *

Term Dictionary

+ * + *

The .tim file contains the list of terms in each field along with per-term statistics (such as + * docfreq) and per-term metadata (typically pointers to the postings list for that term in the + * inverted index). + * + *

The .tim is arranged in blocks: with blocks containing a variable number of entries (by + * default 25-48), where each entry is either a term or a reference to a sub-block. + * + *

NOTE: The term dictionary can plug into different postings implementations: the postings + * writer/reader are actually responsible for encoding and decoding the Postings Metadata and Term + * Metadata sections. + * + *

+ * + *

Notes: + * + *

+ * + *

+ * + *

Term Metadata

+ * + *

The .tmd file contains the list of term metadata (such as FST index metadata) and field level + * statistics (such as sum of total term freq). + * + *

+ * + *

Notes: + * + *

+ * + * + * + *

Term Index

+ * + *

The .tip file contains an index into the term dictionary, so that it can be accessed randomly. + * The index is also used to determine when a given term cannot exist on disk (in the .tim file), + * saving a disk seek. + * + *

+ * + *

Notes: + * + *

+ * + * @see Lucene90BlockTreeTermsReader + */ +public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer { + + /** + * Suggested default value for the {@code minItemsInBlock} parameter to {@link + * #Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. + */ + public static final int DEFAULT_MIN_BLOCK_SIZE = 25; + + /** + * Suggested default value for the {@code maxItemsInBlock} parameter to {@link + * #Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. + */ + public static final int DEFAULT_MAX_BLOCK_SIZE = 48; + + public static final int OUTPUT_FLAGS_NUM_BITS = 2; + public static final int OUTPUT_FLAGS_MASK = 0x3; + public static final int OUTPUT_FLAG_IS_FLOOR = 0x1; + public static final int OUTPUT_FLAG_HAS_TERMS = 0x2; + + /** Extension of terms meta file */ + static final String TERMS_EXTENSION = "tim"; + static final String TERMS_CODEC_NAME = "BlockTreeTermsDict"; + + // public static boolean DEBUG = false; + // public static boolean DEBUG2 = false; + + // private final static boolean SAVE_DOT_FILES = false; + + private final IndexOutput metaOut; + private final IndexOutput termsOut; + private final IndexOutput indexOut; + final int maxDoc; + final int minItemsInBlock; + final int maxItemsInBlock; + final int version; + + final PostingsWriterBase postingsWriter; + final FieldInfos fieldInfos; + + private final List fields = new ArrayList<>(); + + /** + * Create a new writer. The number of items (terms or sub-blocks) per block will aim to be between + * minItemsPerBlock and maxItemsPerBlock, though in some cases the blocks may be smaller than the + * min. + */ + public Lucene90BlockTreeTermsWriter( + SegmentWriteState state, + PostingsWriterBase postingsWriter, + int minItemsInBlock, + int maxItemsInBlock + ) throws IOException { + this(state, postingsWriter, minItemsInBlock, maxItemsInBlock, Lucene90BlockTreeTermsReader.VERSION_CURRENT); + } + + /** Expert constructor that allows configuring the version, used for bw tests. 
*/ + public Lucene90BlockTreeTermsWriter( + SegmentWriteState state, + PostingsWriterBase postingsWriter, + int minItemsInBlock, + int maxItemsInBlock, + int version + ) throws IOException { + validateSettings(minItemsInBlock, maxItemsInBlock); + + this.minItemsInBlock = minItemsInBlock; + this.maxItemsInBlock = maxItemsInBlock; + if (version < Lucene90BlockTreeTermsReader.VERSION_START || version > Lucene90BlockTreeTermsReader.VERSION_CURRENT) { + throw new IllegalArgumentException( + "Expected version in range [" + + Lucene90BlockTreeTermsReader.VERSION_START + + ", " + + Lucene90BlockTreeTermsReader.VERSION_CURRENT + + "], but got " + + version + ); + } + this.version = version; + + this.maxDoc = state.segmentInfo.maxDoc(); + this.fieldInfos = state.fieldInfos; + this.postingsWriter = postingsWriter; + + final String termsName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION); + termsOut = state.directory.createOutput(termsName, state.context); + boolean success = false; + IndexOutput metaOut = null, indexOut = null; + try { + CodecUtil.writeIndexHeader(termsOut, TERMS_CODEC_NAME, version, state.segmentInfo.getId(), state.segmentSuffix); + + final String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION); + indexOut = state.directory.createOutput(indexName, state.context); + CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, version, state.segmentInfo.getId(), state.segmentSuffix); + // segment = state.segmentInfo.name; + + final String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_META_EXTENSION); + metaOut = state.directory.createOutput(metaName, state.context); + CodecUtil.writeIndexHeader(metaOut, TERMS_META_CODEC_NAME, version, state.segmentInfo.getId(), state.segmentSuffix); + + postingsWriter.init(metaOut, state); // have consumer write its format/header + + this.metaOut = metaOut; + this.indexOut = indexOut; + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut); + } + } + } + + /** Throws {@code IllegalArgumentException} if any of these settings is invalid. 
*/ + public static void validateSettings(int minItemsInBlock, int maxItemsInBlock) { + if (minItemsInBlock <= 1) { + throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock); + } + if (minItemsInBlock > maxItemsInBlock) { + throw new IllegalArgumentException( + "maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock + ); + } + if (2 * (minItemsInBlock - 1) > maxItemsInBlock) { + throw new IllegalArgumentException( + "maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + + maxItemsInBlock + + " minItemsInBlock=" + + minItemsInBlock + ); + } + } + + @Override + public void write(Fields fields, NormsProducer norms) throws IOException { + // if (DEBUG) System.out.println("\nBTTW.write seg=" + segment); + + String lastField = null; + for (String field : fields) { + assert lastField == null || lastField.compareTo(field) < 0; + lastField = field; + + // if (DEBUG) System.out.println("\nBTTW.write seg=" + segment + " field=" + field); + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + + TermsEnum termsEnum = terms.iterator(); + TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field)); + while (true) { + BytesRef term = termsEnum.next(); + // if (DEBUG) System.out.println("BTTW: next term " + term); + + if (term == null) { + break; + } + + // if (DEBUG) System.out.println("write field=" + fieldInfo.name + " term=" + + // ToStringUtils.bytesRefToString(term)); + termsWriter.write(term, termsEnum, norms); + } + + termsWriter.finish(); + + // if (DEBUG) System.out.println("\nBTTW.write done seg=" + segment + " field=" + field); + } + } + + static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) { + assert fp < (1L << 62); + return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0); + } + + private static class PendingEntry { + public final boolean isTerm; + + protected PendingEntry(boolean isTerm) { + this.isTerm = isTerm; + } + } + + private static final class PendingTerm extends PendingEntry { + public final byte[] termBytes; + // stats + metadata + public final BlockTermState state; + + PendingTerm(BytesRef term, BlockTermState state) { + super(true); + this.termBytes = new byte[term.length]; + System.arraycopy(term.bytes, term.offset, termBytes, 0, term.length); + this.state = state; + } + + @Override + public String toString() { + return "TERM: " + ToStringUtils.bytesRefToString(termBytes); + } + } + + /** + * Encodes long value to variable length byte[], in MSB order. Use {@link + * FieldReader readMSBVLong} to decode. + * + *

Package private for testing + */ + static void writeMSBVLong(long l, DataOutput scratchBytes) throws IOException { + assert l >= 0; + // Keep zero bits on most significant byte to have more chance to get prefix bytes shared. + // e.g. we expect 0x7FFF stored as [0x81, 0xFF, 0x7F] but not [0xFF, 0xFF, 0x40] + final int bytesNeeded = (Long.SIZE - Long.numberOfLeadingZeros(l) - 1) / 7 + 1; + l <<= Long.SIZE - bytesNeeded * 7; + for (int i = 1; i < bytesNeeded; i++) { + scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL) | 0x80)); + l = l << 7; + } + scratchBytes.writeByte((byte) (((l >>> 57) & 0x7FL))); + } + + private final class PendingBlock extends PendingEntry { + public final BytesRef prefix; + public final long fp; + public FST index; + public List> subIndices; + public final boolean hasTerms; + public final boolean isFloor; + public final int floorLeadByte; + + PendingBlock(BytesRef prefix, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List> subIndices) { + super(false); + this.prefix = prefix; + this.fp = fp; + this.hasTerms = hasTerms; + this.isFloor = isFloor; + this.floorLeadByte = floorLeadByte; + this.subIndices = subIndices; + } + + @Override + public String toString() { + return "BLOCK: prefix=" + ToStringUtils.bytesRefToString(prefix); + } + + public void compileIndex(List blocks, ByteBuffersDataOutput scratchBytes, IntsRefBuilder scratchIntsRef) + throws IOException { + + assert (isFloor && blocks.size() > 1) || (isFloor == false && blocks.size() == 1) : "isFloor=" + isFloor + " blocks=" + blocks; + assert this == blocks.get(0); + + assert scratchBytes.size() == 0; + + // write the leading vLong in MSB order for better outputs sharing in the FST + if (version >= Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT) { + writeMSBVLong(encodeOutput(fp, hasTerms, isFloor), scratchBytes); + } else { + scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor)); + } + if (isFloor) { + scratchBytes.writeVInt(blocks.size() - 1); + for (int i = 1; i < blocks.size(); i++) { + PendingBlock sub = blocks.get(i); + assert sub.floorLeadByte != -1; + // if (DEBUG) { + // System.out.println(" write floorLeadByte=" + + // Integer.toHexString(sub.floorLeadByte&0xff)); + // } + scratchBytes.writeByte((byte) sub.floorLeadByte); + assert sub.fp > fp; + scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0)); + } + } + + long estimateSize = prefix.length; + for (PendingBlock block : blocks) { + if (block.subIndices != null) { + for (FST subIndex : block.subIndices) { + estimateSize += subIndex.numBytes(); + } + } + } + int estimateBitsRequired = PackedInts.bitsRequired(estimateSize); + int pageBits = Math.min(15, Math.max(6, estimateBitsRequired)); + + final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + final int fstVersion; + if (version >= Lucene90BlockTreeTermsReader.VERSION_CURRENT) { + fstVersion = FST.VERSION_CURRENT; + } else { + fstVersion = FST.VERSION_90; + } + final FSTCompiler fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs) + // Disable suffixes sharing for block tree index because suffixes are mostly dropped + // from the FST index and left in the term blocks. 
+ .suffixRAMLimitMB(0d) + .dataOutput(getOnHeapReaderWriter(pageBits)) + .setVersion(fstVersion) + .build(); + // if (DEBUG) { + // System.out.println(" compile index for prefix=" + prefix); + // } + // indexBuilder.DEBUG = false; + final byte[] bytes = scratchBytes.toArrayCopy(); + assert bytes.length > 0; + fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length)); + scratchBytes.reset(); + + // Copy over index for all sub-blocks + for (PendingBlock block : blocks) { + if (block.subIndices != null) { + for (FST subIndex : block.subIndices) { + append(fstCompiler, subIndex, scratchIntsRef); + } + block.subIndices = null; + } + } + + index = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()); + + assert subIndices == null; + + /* + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); + Util.toDot(index, w, false, false); + System.out.println("SAVED to out.dot"); + w.close(); + */ + } + + // TODO: maybe we could add bulk-add method to + // Builder? Takes FST and unions it w/ current + // FST. + private void append(FSTCompiler fstCompiler, FST subIndex, IntsRefBuilder scratchIntsRef) throws IOException { + final BytesRefFSTEnum subIndexEnum = new BytesRefFSTEnum<>(subIndex); + BytesRefFSTEnum.InputOutput indexEnt; + while ((indexEnt = subIndexEnum.next()) != null) { + // if (DEBUG) { + // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + // + indexEnt.output); + // } + fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); + } + } + } + + private final ByteBuffersDataOutput scratchBytes = ByteBuffersDataOutput.newResettableInstance(); + private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); + + private static class StatsWriter { + + private final DataOutput out; + private final boolean hasFreqs; + private int singletonCount; + + StatsWriter(DataOutput out, boolean hasFreqs) { + this.out = out; + this.hasFreqs = hasFreqs; + } + + void add(int df, long ttf) throws IOException { + // Singletons (DF==1, TTF==1) are run-length encoded + if (df == 1 && (hasFreqs == false || ttf == 1)) { + singletonCount++; + } else { + finish(); + out.writeVInt(df << 1); + if (hasFreqs) { + out.writeVLong(ttf - df); + } + } + } + + void finish() throws IOException { + if (singletonCount > 0) { + out.writeVInt(((singletonCount - 1) << 1) | 1); + singletonCount = 0; + } + } + } + + class TermsWriter { + private final FieldInfo fieldInfo; + private long numTerms; + final FixedBitSet docsSeen; + long sumTotalTermFreq; + long sumDocFreq; + + // Records index into pending where the current prefix at that + // length "started"; for example, if current term starts with 't', + // startsByPrefix[0] is the index into pending for the first + // term/sub-block starting with 't'. We use this to figure out when + // to write a new block: + private final BytesRefBuilder lastTerm = new BytesRefBuilder(); + private int[] prefixStarts = new int[8]; + + // Pending stack of terms and blocks. 
As terms arrive (in sorted order) + // we append to this stack, and once the top of the stack has enough + // terms starting with a common prefix, we write a new block with + // those terms and replace those terms in the stack with a new block: + private final List pending = new ArrayList<>(); + + // Reused in writeBlocks: + private final List newBlocks = new ArrayList<>(); + + private PendingTerm firstPendingTerm; + private PendingTerm lastPendingTerm; + + /** Writes the top count entries in pending, using prevTerm to compute the prefix. */ + void writeBlocks(int prefixLength, int count) throws IOException { + + assert count > 0; + + // if (DEBUG2) { + // BytesRef br = new BytesRef(lastTerm.bytes()); + // br.length = prefixLength; + // System.out.println("writeBlocks: seg=" + segment + " prefix=" + + // ToStringUtils.bytesRefToString(br) + " count=" + count); + // } + + // Root block better write all remaining pending entries: + assert prefixLength > 0 || count == pending.size(); + + int lastSuffixLeadLabel = -1; + + // True if we saw at least one term in this block (we record if a block + // only points to sub-blocks in the terms index so we can avoid seeking + // to it when we are looking for a term): + boolean hasTerms = false; + boolean hasSubBlocks = false; + + int start = pending.size() - count; + int end = pending.size(); + int nextBlockStart = start; + int nextFloorLeadLabel = -1; + + for (int i = start; i < end; i++) { + + PendingEntry ent = pending.get(i); + + int suffixLeadLabel; + + if (ent.isTerm) { + PendingTerm term = (PendingTerm) ent; + if (term.termBytes.length == prefixLength) { + // Suffix is 0, i.e. prefix 'foo' and term is + // 'foo' so the term has empty string suffix + // in this block + assert lastSuffixLeadLabel == -1 : "i=" + i + " lastSuffixLeadLabel=" + lastSuffixLeadLabel; + suffixLeadLabel = -1; + } else { + suffixLeadLabel = term.termBytes[prefixLength] & 0xff; + } + } else { + PendingBlock block = (PendingBlock) ent; + assert block.prefix.length > prefixLength; + suffixLeadLabel = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff; + } + // if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + " suffixLeadLabel=" + + // suffixLeadLabel); + + if (suffixLeadLabel != lastSuffixLeadLabel) { + int itemsInBlock = i - nextBlockStart; + if (itemsInBlock >= minItemsInBlock && end - nextBlockStart > maxItemsInBlock) { + // The count is too large for one block, so we must break it into "floor" blocks, where + // we record + // the leading label of the suffix of the first term in each floor block, so at search + // time we can + // jump to the right floor block. We just use a naive greedy segmenter here: make a new + // floor + // block as soon as we have at least minItemsInBlock. 
This is not always best: it often + // produces + // a too-small block as the final block: + boolean isFloor = itemsInBlock < count; + newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, i, hasTerms, hasSubBlocks)); + + hasTerms = false; + hasSubBlocks = false; + nextFloorLeadLabel = suffixLeadLabel; + nextBlockStart = i; + } + + lastSuffixLeadLabel = suffixLeadLabel; + } + + if (ent.isTerm) { + hasTerms = true; + } else { + hasSubBlocks = true; + } + } + + // Write last block, if any: + if (nextBlockStart < end) { + int itemsInBlock = end - nextBlockStart; + boolean isFloor = itemsInBlock < count; + newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, end, hasTerms, hasSubBlocks)); + } + + assert newBlocks.isEmpty() == false; + + PendingBlock firstBlock = newBlocks.get(0); + + assert firstBlock.isFloor || newBlocks.size() == 1; + + firstBlock.compileIndex(newBlocks, scratchBytes, scratchIntsRef); + + // Remove slice from the top of the pending stack, that we just wrote: + pending.subList(pending.size() - count, pending.size()).clear(); + + // Append new block + pending.add(firstBlock); + + newBlocks.clear(); + } + + private boolean allEqual(byte[] b, int startOffset, int endOffset, byte value) { + Objects.checkFromToIndex(startOffset, endOffset, b.length); + for (int i = startOffset; i < endOffset; ++i) { + if (b[i] != value) { + return false; + } + } + return true; + } + + /** + * Writes the specified slice (start is inclusive, end is exclusive) from pending stack as a new + * block. If isFloor is true, there were too many (more than maxItemsInBlock) entries sharing + * the same prefix, and so we broke it into multiple floor blocks where we record the starting + * label of the suffix of each floor block. + */ + private PendingBlock writeBlock( + int prefixLength, + boolean isFloor, + int floorLeadLabel, + int start, + int end, + boolean hasTerms, + boolean hasSubBlocks + ) throws IOException { + + assert end > start; + + long startFP = termsOut.getFilePointer(); + + boolean hasFloorLeadLabel = isFloor && floorLeadLabel != -1; + + final BytesRef prefix = new BytesRef(prefixLength + (hasFloorLeadLabel ? 1 : 0)); + System.arraycopy(lastTerm.get().bytes, 0, prefix.bytes, 0, prefixLength); + prefix.length = prefixLength; + + // if (DEBUG2) System.out.println(" writeBlock field=" + fieldInfo.name + " prefix=" + + // ToStringUtils.bytesRefToString(prefix) + " fp=" + startFP + " isFloor=" + isFloor + + // " isLastInFloor=" + (end == pending.size()) + " floorLeadLabel=" + floorLeadLabel + + // " start=" + start + " end=" + end + " hasTerms=" + hasTerms + " hasSubBlocks=" + + // hasSubBlocks); + + // Write block header: + int numEntries = end - start; + int code = numEntries << 1; + if (end == pending.size()) { + // Last block: + code |= 1; + } + termsOut.writeVInt(code); + + /* + if (DEBUG) { + System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + ToStringUtils.bytesRefToString(prefix) + + " entCount="+(end-start+1) +" startFP="+startFP+(isFloor ? (" floorLeadLabel=" + Integer.toHexString(floorLeadLabel)) : "")); + } + */ + + // 1st pass: pack term suffix bytes into byte[] blob + // TODO: cutover to bulk int codec... simple64? 
+ + // We optimize the leaf block case (block has only terms), writing a more + // compact format in this case: + boolean isLeafBlock = hasSubBlocks == false; + + // System.out.println(" isLeaf=" + isLeafBlock); + + final List> subIndices; + + boolean absolute = true; + + if (isLeafBlock) { + // Block contains only ordinary terms: + subIndices = null; + StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS); + for (int i = start; i < end; i++) { + PendingEntry ent = pending.get(i); + assert ent.isTerm : "i=" + i; + + PendingTerm term = (PendingTerm) ent; + + assert StringHelper.startsWith(term.termBytes, prefix) : term + " prefix=" + prefix; + BlockTermState state = term.state; + final int suffix = term.termBytes.length - prefixLength; + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write term suffix=" + + // ToStringUtils.bytesRefToString(suffixBytes)); + // } + + // For leaf block we write suffix straight + suffixLengthsWriter.writeVInt(suffix); + suffixWriter.append(term.termBytes, prefixLength, suffix); + assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel; + + // Write term stats, to separate byte[] blob: + statsWriter.add(state.docFreq, state.totalTermFreq); + + // Write term meta data + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + absolute = false; + } + statsWriter.finish(); + } else { + // Block has at least one prefix term or a sub block: + subIndices = new ArrayList<>(); + StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS); + for (int i = start; i < end; i++) { + PendingEntry ent = pending.get(i); + if (ent.isTerm) { + PendingTerm term = (PendingTerm) ent; + + assert StringHelper.startsWith(term.termBytes, prefix) : term + " prefix=" + prefix; + BlockTermState state = term.state; + final int suffix = term.termBytes.length - prefixLength; + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write term suffix=" + + // ToStringUtils.bytesRefToString(suffixBytes)); + // } + + // For non-leaf block we borrow 1 bit to record + // if entry is term or sub-block, and 1 bit to record if + // it's a prefix term. Terms cannot be larger than ~32 KB + // so we won't run out of bits: + + suffixLengthsWriter.writeVInt(suffix << 1); + suffixWriter.append(term.termBytes, prefixLength, suffix); + + // Write term stats, to separate byte[] blob: + statsWriter.add(state.docFreq, state.totalTermFreq); + + // TODO: now that terms dict "sees" these longs, + // we can explore better column-stride encodings + // to encode all long[0]s for this block at + // once, all long[1]s, etc., e.g. using + // Simple64. Alternatively, we could interleave + // stats + meta ... 
no reason to have them + // separate anymore: + + // Write term meta data + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + absolute = false; + } else { + PendingBlock block = (PendingBlock) ent; + assert StringHelper.startsWith(block.prefix, prefix); + final int suffix = block.prefix.length - prefixLength; + assert StringHelper.startsWith(block.prefix, prefix); + + assert suffix > 0; + + // For non-leaf block we borrow 1 bit to record + // if entry is term or sub-block:f + suffixLengthsWriter.writeVInt((suffix << 1) | 1); + suffixWriter.append(block.prefix.bytes, prefixLength, suffix); + + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write sub-block suffix=" + + // ToStringUtils.bytesRefToString(suffixBytes) + " subFP=" + block.fp + " subCode=" + + // (startFP-block.fp) + " floor=" + block.isFloor); + // } + + assert floorLeadLabel == -1 || (block.prefix.bytes[prefixLength] & 0xff) >= floorLeadLabel + : "floorLeadLabel=" + floorLeadLabel + " suffixLead=" + (block.prefix.bytes[prefixLength] & 0xff); + assert block.fp < startFP; + + suffixLengthsWriter.writeVLong(startFP - block.fp); + subIndices.add(block.index); + } + } + statsWriter.finish(); + + assert subIndices.size() != 0; + } + + // Write suffixes byte[] blob to terms dict output, either uncompressed, compressed with LZ4 + // or with LowercaseAsciiCompression. + CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION; + // If there are 2 suffix bytes or less per term, then we don't bother compressing as suffix + // are unlikely what + // makes the terms dictionary large, and it also tends to be frequently the case for dense IDs + // like + // auto-increment IDs, so not compressing in that case helps not hurt ID lookups by too much. + // We also only start compressing when the prefix length is greater than 2 since blocks whose + // prefix length is + // 1 or 2 always all get visited when running a fuzzy query whose max number of edits is 2. + if (suffixWriter.length() > 2L * numEntries && prefixLength > 2) { + // LZ4 inserts references whenever it sees duplicate strings of 4 chars or more, so only try + // it out if the + // average suffix length is greater than 6. 
+ if (suffixWriter.length() > 6L * numEntries) { + if (compressionHashTable == null) { + compressionHashTable = new LZ4.HighCompressionHashTable(); + } + LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable); + if (spareWriter.size() < suffixWriter.length() - (suffixWriter.length() >>> 2)) { + // LZ4 saved more than 25%, go for it + compressionAlg = CompressionAlgorithm.LZ4; + } + } + if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) { + spareWriter.reset(); + if (spareBytes.length < suffixWriter.length()) { + spareBytes = new byte[ArrayUtil.oversize(suffixWriter.length(), 1)]; + } + if (LowercaseAsciiCompression.compress(suffixWriter.bytes(), suffixWriter.length(), spareBytes, spareWriter)) { + compressionAlg = CompressionAlgorithm.LOWERCASE_ASCII; + } + } + } + long token = ((long) suffixWriter.length()) << 3; + if (isLeafBlock) { + token |= 0x04; + } + token |= compressionAlg.code; + termsOut.writeVLong(token); + if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) { + termsOut.writeBytes(suffixWriter.bytes(), suffixWriter.length()); + } else { + spareWriter.copyTo(termsOut); + } + suffixWriter.setLength(0); + spareWriter.reset(); + + // Write suffix lengths + final int numSuffixBytes = Math.toIntExact(suffixLengthsWriter.size()); + spareBytes = ArrayUtil.growNoCopy(spareBytes, numSuffixBytes); + suffixLengthsWriter.copyTo(new ByteArrayDataOutput(spareBytes)); + suffixLengthsWriter.reset(); + if (allEqual(spareBytes, 1, numSuffixBytes, spareBytes[0])) { + // Structured fields like IDs often have most values of the same length + termsOut.writeVInt((numSuffixBytes << 1) | 1); + termsOut.writeByte(spareBytes[0]); + } else { + termsOut.writeVInt(numSuffixBytes << 1); + termsOut.writeBytes(spareBytes, numSuffixBytes); + } + + // Stats + final int numStatsBytes = Math.toIntExact(statsWriter.size()); + termsOut.writeVInt(numStatsBytes); + statsWriter.copyTo(termsOut); + statsWriter.reset(); + + // Write term meta data byte[] blob + termsOut.writeVInt((int) metaWriter.size()); + metaWriter.copyTo(termsOut); + metaWriter.reset(); + + // if (DEBUG) { + // System.out.println(" fpEnd=" + out.getFilePointer()); + // } + + if (hasFloorLeadLabel) { + // We already allocated to length+1 above: + prefix.bytes[prefix.length++] = (byte) floorLeadLabel; + } + + return new PendingBlock(prefix, startFP, hasTerms, isFloor, floorLeadLabel, subIndices); + } + + TermsWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + assert fieldInfo.getIndexOptions() != IndexOptions.NONE; + docsSeen = new FixedBitSet(maxDoc); + postingsWriter.setField(fieldInfo); + } + + /** Writes one term's worth of postings. 
*/ + public void write(BytesRef text, TermsEnum termsEnum, NormsProducer norms) throws IOException { + /* + if (DEBUG) { + int[] tmp = new int[lastTerm.length]; + System.arraycopy(prefixStarts, 0, tmp, 0, tmp.length); + System.out.println("BTTW: write term=" + ToStringUtils.bytesRefToString(text) + " prefixStarts=" + Arrays.toString(tmp) + + " pending.size()=" + pending.size()); + } + */ + + BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen, norms); + if (state != null) { + + assert state.docFreq != 0; + assert fieldInfo.getIndexOptions() == IndexOptions.DOCS || state.totalTermFreq >= state.docFreq + : "postingsWriter=" + postingsWriter; + pushTerm(text); + + PendingTerm term = new PendingTerm(text, state); + pending.add(term); + // if (DEBUG) System.out.println(" add pending term = " + text + " pending.size()=" + + // pending.size()); + + sumDocFreq += state.docFreq; + sumTotalTermFreq += state.totalTermFreq; + numTerms++; + if (firstPendingTerm == null) { + firstPendingTerm = term; + } + lastPendingTerm = term; + } + } + + /** Pushes the new term to the top of the stack, and writes new blocks. */ + private void pushTerm(BytesRef text) throws IOException { + // Find common prefix between last term and current term: + int prefixLength = Arrays.mismatch(lastTerm.bytes(), 0, lastTerm.length(), text.bytes, text.offset, text.offset + text.length); + if (prefixLength == -1) { // Only happens for the first term, if it is empty + assert lastTerm.length() == 0; + prefixLength = 0; + } + + // if (DEBUG) System.out.println(" shared=" + pos + " lastTerm.length=" + lastTerm.length); + + // Close the "abandoned" suffix now: + for (int i = lastTerm.length() - 1; i >= prefixLength; i--) { + + // How many items on top of the stack share the current suffix + // we are closing: + int prefixTopSize = pending.size() - prefixStarts[i]; + if (prefixTopSize >= minItemsInBlock) { + // if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + + // " minItemsInBlock=" + minItemsInBlock); + writeBlocks(i + 1, prefixTopSize); + prefixStarts[i] -= prefixTopSize - 1; + } + } + + if (prefixStarts.length < text.length) { + prefixStarts = ArrayUtil.grow(prefixStarts, text.length); + } + + // Init new tail: + for (int i = prefixLength; i < text.length; i++) { + prefixStarts[i] = pending.size(); + } + + lastTerm.copyBytes(text); + } + + // Finishes all terms in this field + public void finish() throws IOException { + if (numTerms > 0) { + // if (DEBUG) System.out.println("BTTW: finish prefixStarts=" + + // Arrays.toString(prefixStarts)); + + // Add empty term to force closing of all final blocks: + pushTerm(new BytesRef()); + + // TODO: if pending.size() is already 1 with a non-zero prefix length + // we can save writing a "degenerate" root block, but we have to + // fix all the places that assume the root block's prefix is the empty string: + pushTerm(new BytesRef()); + writeBlocks(0, pending.size()); + + // We better have one final "root" block: + assert pending.size() == 1 && pending.get(0).isTerm == false : "pending.size()=" + pending.size() + " pending=" + pending; + final PendingBlock root = (PendingBlock) pending.get(0); + assert root.prefix.length == 0; + final BytesRef rootCode = root.index.getEmptyOutput(); + assert rootCode != null; + + ByteBuffersDataOutput metaOut = new ByteBuffersDataOutput(); + fields.add(metaOut); + + metaOut.writeVInt(fieldInfo.number); + metaOut.writeVLong(numTerms); + metaOut.writeVInt(rootCode.length); + 
metaOut.writeBytes(rootCode.bytes, rootCode.offset, rootCode.length); + assert fieldInfo.getIndexOptions() != IndexOptions.NONE; + if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) { + metaOut.writeVLong(sumTotalTermFreq); + } + metaOut.writeVLong(sumDocFreq); + metaOut.writeVInt(docsSeen.cardinality()); + writeBytesRef(metaOut, new BytesRef(firstPendingTerm.termBytes)); + writeBytesRef(metaOut, new BytesRef(lastPendingTerm.termBytes)); + metaOut.writeVLong(indexOut.getFilePointer()); + // Write FST to index + root.index.save(metaOut, indexOut); + // System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name); + + /* + if (DEBUG) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(root.index, w, false, false); + System.out.println("SAVED to " + dotFileName); + w.close(); + } + */ + + } else { + assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS && sumTotalTermFreq == -1; + assert sumDocFreq == 0; + assert docsSeen.cardinality() == 0; + } + } + + private final ByteBuffersDataOutput suffixLengthsWriter = ByteBuffersDataOutput.newResettableInstance(); + private final BytesRefBuilder suffixWriter = new BytesRefBuilder(); + private final ByteBuffersDataOutput statsWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput spareWriter = ByteBuffersDataOutput.newResettableInstance(); + private byte[] spareBytes = BytesRef.EMPTY_BYTES; + private LZ4.HighCompressionHashTable compressionHashTable; + } + + private boolean closed; + + @Override + public void close() throws IOException { + if (closed) { + return; + } + closed = true; + + boolean success = false; + try { + metaOut.writeVInt(fields.size()); + for (ByteBuffersDataOutput fieldMeta : fields) { + fieldMeta.copyTo(metaOut); + } + CodecUtil.writeFooter(indexOut); + metaOut.writeLong(indexOut.getFilePointer()); + CodecUtil.writeFooter(termsOut); + metaOut.writeLong(termsOut.getFilePointer()); + CodecUtil.writeFooter(metaOut); + success = true; + } finally { + if (success) { + IOUtils.close(metaOut, termsOut, indexOut, postingsWriter); + } else { + IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut, postingsWriter); + } + } + } + + private static void writeBytesRef(DataOutput out, BytesRef bytes) throws IOException { + out.writeVInt(bytes.length); + out.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java index 5f290a72ba833..964e7c5f43000 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java @@ -35,9 +35,11 @@ import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataAccessHint; +import org.apache.lucene.store.FileDataHint; +import org.apache.lucene.store.FileTypeHint; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.Bits; 
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; @@ -106,7 +108,7 @@ public class ES816BinaryQuantizedVectorsReader extends FlatVectorsReader { ES816BinaryQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME, // Quantized vectors are accessed randomly from their node ID stored in the HNSW // graph. - state.context.withReadAdvice(ReadAdvice.RANDOM) + state.context.withHints(FileTypeHint.DATA, FileDataHint.KNN_VECTORS, DataAccessHint.RANDOM) ); success = true; } finally { diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsFormat.java index a0cd6dbf65688..ebe226bbaa2d0 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsFormat.java @@ -30,10 +30,9 @@ import org.apache.lucene.store.FlushInfo; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.MergeInfo; -import org.apache.lucene.store.ReadAdvice; +import org.elasticsearch.common.util.set.Sets; import java.io.IOException; -import java.util.Optional; import java.util.Set; /** @@ -67,7 +66,40 @@ public FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOExceptio return new Lucene99FlatVectorsWriter(state, vectorsScorer); } - private static final IOContext DIRECT_IO_CONTEXT = new IOContext() { + @Override + public FlatVectorsReader fieldsReader(SegmentReadState state) throws IOException { + // only override the context for the random-access use case + SegmentReadState directIOState = state.context.context() == IOContext.Context.DEFAULT + ? new SegmentReadState( + state.directory, + state.segmentInfo, + state.fieldInfos, + new DirectIOContext(state.context.hints()), + state.segmentSuffix + ) + : state; + // Use mmap for merges and direct I/O for searches. + // TODO: Open the mmap file with sequential access instead of random (current behavior). + return new MergeReaderWrapper( + new Lucene99FlatVectorsReader(directIOState, vectorsScorer), + new Lucene99FlatVectorsReader(state, vectorsScorer) + ); + } + + @Override + public String toString() { + return "Lucene99FlatVectorsFormat(" + "vectorsScorer=" + vectorsScorer + ')'; + } + + static class DirectIOContext implements IOContext { + + final Set<FileOpenHint> hints; + + DirectIOContext(Set<FileOpenHint> hints) { + // always add DirectIOHint to the hints given + this.hints = Sets.union(hints, Set.of(DirectIOHint.INSTANCE)); + } + @Override public Context context() { return Context.DEFAULT; @@ -85,44 +117,12 @@ public FlushInfo flushInfo() { @Override public Set<FileOpenHint> hints() { - return Set.of(DirectIOHint.INSTANCE); + return hints; } @Override public IOContext withHints(FileOpenHint... hints) { - return this; - } - - @Override - public Optional<ReadAdvice> readAdvice() { - return Optional.empty(); - } - - @Override - public IOContext withReadAdvice(ReadAdvice advice) { - return this + return new DirectIOContext(Set.of(hints)); } - }; - - @Override - public FlatVectorsReader fieldsReader(SegmentReadState state) throws IOException { - SegmentReadState directIOState = new SegmentReadState( - state.directory, - state.segmentInfo, - state.fieldInfos, - DIRECT_IO_CONTEXT, - state.segmentSuffix - ); - // Use mmap for merges and direct I/O for searches. - // TODO: Open the mmap file with sequential access instead of random (current behavior). 
- return new MergeReaderWrapper( - new Lucene99FlatVectorsReader(directIOState, vectorsScorer), - new Lucene99FlatVectorsReader(state, vectorsScorer) - ); - } - - @Override - public String toString() { - return "Lucene99FlatVectorsFormat(" + "vectorsScorer=" + vectorsScorer + ')'; } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java index 8e547f9b3fa20..6d7eee650e902 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java @@ -35,9 +35,11 @@ import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataAccessHint; +import org.apache.lucene.store.FileDataHint; +import org.apache.lucene.store.FileTypeHint; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; @@ -107,7 +109,7 @@ public class ES818BinaryQuantizedVectorsReader extends FlatVectorsReader { ES818BinaryQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME, // Quantized vectors are accessed randomly from their node ID stored in the HNSW // graph. - state.context.withReadAdvice(ReadAdvice.RANDOM) + state.context.withHints(FileTypeHint.DATA, FileDataHint.KNN_VECTORS, DataAccessHint.RANDOM) ); success = true; } finally { diff --git a/server/src/main/java/org/elasticsearch/index/store/Store.java b/server/src/main/java/org/elasticsearch/index/store/Store.java index af28bc3bb32d3..d4f2fcb23baab 100644 --- a/server/src/main/java/org/elasticsearch/index/store/Store.java +++ b/server/src/main/java/org/elasticsearch/index/store/Store.java @@ -27,13 +27,13 @@ import org.apache.lucene.store.BufferedChecksum; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataAccessHint; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.Lock; import org.apache.lucene.store.NIOFSDirectory; -import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; @@ -153,7 +153,7 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref // while equivalent, these different read once contexts are checked by identity in directory implementations private static IOContext createReadOnceContext() { - var context = IOContext.READONCE.withReadAdvice(ReadAdvice.SEQUENTIAL); + var context = IOContext.READONCE.withHints(DataAccessHint.SEQUENTIAL); assert context != IOContext.READONCE; assert context.equals(IOContext.READONCE); return context; diff --git a/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec b/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec index 1fbdaea9c772a..971db6dcc032c 100644 --- a/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ 
b/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -2,3 +2,4 @@ org.elasticsearch.index.codec.Elasticsearch814Codec org.elasticsearch.index.codec.Elasticsearch816Codec org.elasticsearch.index.codec.Elasticsearch900Codec org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec +org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java index fdcfe5e3720f8..58b847d7a87a1 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java @@ -12,7 +12,7 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; @@ -326,7 +326,7 @@ public void testTriangle() throws Exception { public void testCompletionField() throws Exception { IndexWriterConfig config = new IndexWriterConfig().setCommitOnClose(true) .setUseCompoundFile(false) - .setCodec(new Lucene101Codec(Lucene101Codec.Mode.BEST_SPEED) { + .setCodec(new Lucene103Codec(Lucene103Codec.Mode.BEST_SPEED) { @Override public PostingsFormat getPostingsFormatForField(String field) { if (field.startsWith("suggest_")) { @@ -432,25 +432,25 @@ private static void addFieldsToDoc(Document doc, IndexableField[] fields) { enum CodecMode { BEST_SPEED { @Override - Lucene101Codec.Mode mode() { - return Lucene101Codec.Mode.BEST_SPEED; + Lucene103Codec.Mode mode() { + return Lucene103Codec.Mode.BEST_SPEED; } }, BEST_COMPRESSION { @Override - Lucene101Codec.Mode mode() { - return Lucene101Codec.Mode.BEST_COMPRESSION; + Lucene103Codec.Mode mode() { + return Lucene103Codec.Mode.BEST_COMPRESSION; } }; - abstract Lucene101Codec.Mode mode(); + abstract Lucene103Codec.Mode mode(); } static void indexRandomly(Directory directory, CodecMode codecMode, int numDocs, Consumer<Document> addFields) throws IOException { IndexWriterConfig config = new IndexWriterConfig().setCommitOnClose(true) .setUseCompoundFile(randomBoolean()) - .setCodec(new Lucene101Codec(codecMode.mode())); + .setCodec(new Lucene103Codec(codecMode.mode())); try (IndexWriter writer = new IndexWriter(directory, config)) { for (int i = 0; i < numDocs; i++) { final Document doc = new Document(); @@ -662,7 +662,7 @@ static void rewriteIndexWithPerFieldCodec(Directory source, CodecMode mode, Dire try (DirectoryReader reader = DirectoryReader.open(source)) { IndexWriterConfig config = new IndexWriterConfig().setSoftDeletesField(Lucene.SOFT_DELETES_FIELD) .setUseCompoundFile(randomBoolean()) - .setCodec(new Lucene101Codec(mode.mode()) { + .setCodec(new Lucene103Codec(mode.mode()) { @Override public PostingsFormat getPostingsFormatForField(String field) { return new ES812PostingsFormat(); diff --git a/server/src/test/java/org/elasticsearch/common/lucene/search/function/MinScoreScorerTests.java b/server/src/test/java/org/elasticsearch/common/lucene/search/function/MinScoreScorerTests.java index 
55ca666d8588b..b44c9e06bc059 100644 --- a/server/src/test/java/org/elasticsearch/common/lucene/search/function/MinScoreScorerTests.java +++ b/server/src/test/java/org/elasticsearch/common/lucene/search/function/MinScoreScorerTests.java @@ -142,7 +142,7 @@ public float getMaxScore(int upTo) throws IOException { random(), new ScoreMode[] { ScoreMode.COMPLETE, ScoreMode.TOP_SCORES, ScoreMode.TOP_DOCS_WITH_SCORES } ); - final Scorer assertingScorer = AssertingScorer.wrap(random(), scorer, scoreMode, true); + final Scorer assertingScorer = AssertingScorer.wrap(scorer, scoreMode.needsScores(), true); if (twoPhase && randomBoolean()) { return hideTwoPhaseIterator(assertingScorer); } else { diff --git a/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java b/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java index 23ee616d54231..a2ff440facaf0 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java @@ -50,7 +50,7 @@ public void testResolveDefaultCodecs() throws Exception { assumeTrue("Only when zstd_stored_fields feature flag is enabled", CodecService.ZSTD_STORED_FIELDS_FEATURE_FLAG); CodecService codecService = createCodecService(); assertThat(codecService.codec("default"), instanceOf(PerFieldMapperCodec.class)); - assertThat(codecService.codec("default"), instanceOf(Elasticsearch900Lucene101Codec.class)); + assertThat(codecService.codec("default"), instanceOf(Elasticsearch92Lucene103Codec.class)); } public void testDefault() throws Exception { diff --git a/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java b/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java index 0941ac17d6a90..2afddca3f0876 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java @@ -10,7 +10,7 @@ package org.elasticsearch.index.codec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.common.settings.Settings; @@ -94,8 +94,8 @@ public void testUseBloomFilter() throws IOException { assertThat(perFieldMapperCodec.getPostingsFormatForField("_id"), instanceOf(ES87BloomFilterPostingsFormat.class)); assertThat(perFieldMapperCodec.useBloomFilter("another_field"), is(false)); - Class<? extends PostingsFormat> expectedPostingsFormat = PerFieldFormatSupplier.USE_LUCENE101_POSTINGS_FORMAT.isEnabled() - && timeSeries == false ? Lucene101PostingsFormat.class : ES812PostingsFormat.class; + Class<? extends PostingsFormat> expectedPostingsFormat = PerFieldFormatSupplier.USE_DEFAULT_LUCENE_POSTINGS_FORMAT.isEnabled() + && timeSeries == false ? 
Lucene103PostingsFormat.class : ES812PostingsFormat.class; assertThat(perFieldMapperCodec.getPostingsFormatForField("another_field"), instanceOf(expectedPostingsFormat)); } @@ -110,8 +110,8 @@ public void testUseBloomFilterWithTimestampFieldEnabled() throws IOException { public void testUseBloomFilterWithTimestampFieldEnabled_noTimeSeriesMode() throws IOException { PerFieldFormatSupplier perFieldMapperCodec = createFormatSupplier(true, false, false); assertThat(perFieldMapperCodec.useBloomFilter("_id"), is(false)); - Class<? extends PostingsFormat> expectedPostingsFormat = PerFieldFormatSupplier.USE_LUCENE101_POSTINGS_FORMAT.isEnabled() - ? Lucene101PostingsFormat.class + Class<? extends PostingsFormat> expectedPostingsFormat = PerFieldFormatSupplier.USE_DEFAULT_LUCENE_POSTINGS_FORMAT.isEnabled() + ? Lucene103PostingsFormat.class : ES812PostingsFormat.class; assertThat(perFieldMapperCodec.getPostingsFormatForField("_id"), instanceOf(expectedPostingsFormat)); } diff --git a/server/src/test/java/org/elasticsearch/index/codec/postings/ES812PostingsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/postings/ES812PostingsFormatTests.java index b11ab47102288..f59e075d6ec5a 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/postings/ES812PostingsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/postings/ES812PostingsFormatTests.java @@ -19,10 +19,10 @@ */ package org.elasticsearch.index.codec.postings; +import org.apache.lucene.backward_codecs.lucene90.blocktree.FieldReader; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; -import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; -import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java index f0ce28f11a51a..e711f032a9f68 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesCodecDuelTests.java @@ -26,7 +26,7 @@ import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormatTests.TestES87TSDBDocValuesFormat; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import org.elasticsearch.test.ESTestCase; @@ -55,7 +55,7 @@ public void testDuel() throws IOException { baselineConfig.setCodec(TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat())); var contenderConf = newIndexWriterConfig(); contenderConf.setMergePolicy(mergePolicy); - Codec codec = new Elasticsearch900Lucene101Codec() { + Codec codec = new Elasticsearch92Lucene103Codec() { final DocValuesFormat docValuesFormat = randomBoolean() ? 
new ES819TSDBDocValuesFormat() diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java index a219ebb3740cc..f9e3313d012c4 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesFormatTests.java @@ -12,7 +12,6 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesConsumer; -import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.SortedDocValuesField; @@ -34,9 +33,9 @@ import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.BaseDocValuesFormatTestCase; import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.logging.LogConfigurator; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; import java.io.IOException; import java.util.ArrayList; @@ -53,7 +52,6 @@ public class ES87TSDBDocValuesFormatTests extends BaseDocValuesFormatTestCase { private static final int NUM_DOCS = 10; static { - // For Elasticsearch900Lucene101Codec: LogConfigurator.loadLog4jPlugins(); LogConfigurator.configureESLogging(); } @@ -74,13 +72,7 @@ public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOExcept } } - private final Codec codec = new Elasticsearch900Lucene101Codec() { - - @Override - public DocValuesFormat getDocValuesFormatForField(String field) { - return new TestES87TSDBDocValuesFormat(); - } - }; + private final Codec codec = TestUtil.alwaysDocValuesFormat(new TestES87TSDBDocValuesFormat()); @Override protected Codec getCodec() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java index 9c41e7a80ed66..f9426347e7ca1 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/TsdbDocValueBwcTests.java @@ -33,7 +33,7 @@ import org.elasticsearch.cluster.metadata.DataStream; import org.elasticsearch.core.SuppressForbidden; import org.elasticsearch.index.codec.Elasticsearch816Codec; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; import org.elasticsearch.index.codec.perfield.XPerFieldDocValuesFormat; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormatTests.TestES87TSDBDocValuesFormat; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; @@ -64,7 +64,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) { return docValuesFormat; } }; - var newCodec = new Elasticsearch900Lucene101Codec() { + var newCodec = new Elasticsearch92Lucene103Codec() { final DocValuesFormat docValuesFormat = new ES819TSDBDocValuesFormat(); diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java index 368d6f23d0fa1..ff27e1de5b82d 100644 --- 
a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java @@ -10,7 +10,6 @@ package org.elasticsearch.index.codec.tsdb.es819; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.NumericDocValuesField; @@ -25,9 +24,9 @@ import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortedNumericSortField; +import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.elasticsearch.cluster.metadata.DataStream; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormatTests; import java.util.Arrays; @@ -35,15 +34,7 @@ public class ES819TSDBDocValuesFormatTests extends ES87TSDBDocValuesFormatTests { - private final Codec codec = new Elasticsearch900Lucene101Codec() { - - final ES819TSDBDocValuesFormat docValuesFormat = new ES819TSDBDocValuesFormat(); - - @Override - public DocValuesFormat getDocValuesFormatForField(String field) { - return docValuesFormat; - } - }; + final Codec codec = TestUtil.alwaysDocValuesFormat(new ES819TSDBDocValuesFormat()); @Override protected Codec getCodec() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormatTests.java index 9ea13750d8cd7..6b6c3056f4cb5 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813FlatVectorFormatTests.java @@ -10,9 +10,7 @@ package org.elasticsearch.index.codec.vectors; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; @@ -23,6 +21,7 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.common.logging.LogConfigurator; import java.io.IOException; @@ -36,14 +35,11 @@ public class ES813FlatVectorFormatTests extends BaseKnnVectorsFormatTestCase { LogConfigurator.configureESLogging(); // native access requires logging to be initialized } + final Codec codec = TestUtil.alwaysKnnVectorsFormat(new ES813FlatVectorFormat()); + @Override protected Codec getCodec() { - return new Lucene101Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new ES813FlatVectorFormat(); - } - }; + return codec; } public void testSearchWithVisitedLimit() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormatTests.java index 6a29bf08ae9a5..402f793ed7987 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormatTests.java +++ 
b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES813Int8FlatVectorFormatTests.java @@ -10,9 +10,7 @@ package org.elasticsearch.index.codec.vectors; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; @@ -23,6 +21,7 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.common.logging.LogConfigurator; import java.io.IOException; @@ -36,14 +35,11 @@ public class ES813Int8FlatVectorFormatTests extends BaseKnnVectorsFormatTestCase LogConfigurator.configureESLogging(); // native access requires logging to be initialized } + final Codec codec = TestUtil.alwaysKnnVectorsFormat(new ES813Int8FlatVectorFormat()); + @Override protected Codec getCodec() { - return new Lucene101Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new ES813Int8FlatVectorFormat(); - } - }; + return codec; } public void testSearchWithVisitedLimit() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES814HnswScalarQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES814HnswScalarQuantizedVectorsFormatTests.java index b064321a70ae8..355dd07934eca 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES814HnswScalarQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES814HnswScalarQuantizedVectorsFormatTests.java @@ -10,9 +10,7 @@ package org.elasticsearch.index.codec.vectors; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -29,6 +27,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.common.logging.LogConfigurator; import java.io.IOException; @@ -45,14 +44,11 @@ public class ES814HnswScalarQuantizedVectorsFormatTests extends BaseKnnVectorsFo LogConfigurator.configureESLogging(); // native access requires logging to be initialized } + final Codec codec = TestUtil.alwaysKnnVectorsFormat(new ES814HnswScalarQuantizedVectorsFormat()); + @Override protected Codec getCodec() { - return new Lucene101Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new ES814HnswScalarQuantizedVectorsFormat(); - } - }; + return codec; } // The following test scenarios are similar to their superclass namesakes, diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorFormatTests.java index c770a62479d38..c65e43e100c8a 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorFormatTests.java +++ 
b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815BitFlatVectorFormatTests.java @@ -10,9 +10,7 @@ package org.elasticsearch.index.codec.vectors; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnByteVectorField; @@ -23,20 +21,18 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.util.TestUtil; import org.junit.Before; import java.io.IOException; public class ES815BitFlatVectorFormatTests extends BaseKnnBitVectorsFormatTestCase { + final Codec codec = TestUtil.alwaysKnnVectorsFormat(new ES815BitFlatVectorFormat()); + @Override protected Codec getCodec() { - return new Lucene101Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new ES815BitFlatVectorFormat(); - } - }; + return codec; } @Before diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815HnswBitVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815HnswBitVectorsFormatTests.java index 544dccc73c070..fce4fdf748452 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815HnswBitVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/ES815HnswBitVectorsFormatTests.java @@ -10,9 +10,7 @@ package org.elasticsearch.index.codec.vectors; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnByteVectorField; @@ -23,20 +21,18 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.util.TestUtil; import org.junit.Before; import java.io.IOException; public class ES815HnswBitVectorsFormatTests extends BaseKnnBitVectorsFormatTestCase { + final Codec codec = TestUtil.alwaysKnnVectorsFormat(new ES815HnswBitVectorsFormat()); + @Override protected Codec getCodec() { - return new Lucene101Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new ES815HnswBitVectorsFormat(); - } - }; + return codec; } @Before diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java index 6aec7b595ce8b..07e2b0568c6fd 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java @@ -23,7 +23,6 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import 
org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; @@ -43,6 +42,7 @@ import org.apache.lucene.search.TotalHits; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.common.logging.LogConfigurator; import org.elasticsearch.index.codec.vectors.BQVectorUtils; @@ -62,14 +62,11 @@ public class ES816BinaryQuantizedVectorsFormatTests extends BaseKnnVectorsFormat LogConfigurator.configureESLogging(); // native access requires logging to be initialized } + final Codec codec = TestUtil.alwaysKnnVectorsFormat(new ES816BinaryQuantizedRWVectorsFormat()); + @Override protected Codec getCodec() { - return new Lucene101Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new ES816BinaryQuantizedRWVectorsFormat(); - } - }; + return codec; } public void testSearch() throws Exception { diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816HnswBinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816HnswBinaryQuantizedVectorsFormatTests.java index a80240e228efc..0bba6e7248068 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816HnswBinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816HnswBinaryQuantizedVectorsFormatTests.java @@ -23,7 +23,6 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; @@ -39,6 +38,7 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.SameThreadExecutorService; import org.elasticsearch.common.logging.LogConfigurator; @@ -59,14 +59,11 @@ public class ES816HnswBinaryQuantizedVectorsFormatTests extends BaseKnnVectorsFo LogConfigurator.configureESLogging(); // native access requires logging to be initialized } + final Codec codec = TestUtil.alwaysKnnVectorsFormat(new ES816HnswBinaryQuantizedRWVectorsFormat()); + @Override protected Codec getCodec() { - return new Lucene101Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new ES816HnswBinaryQuantizedRWVectorsFormat(); - } - }; + return codec; } public void testToString() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java index 26bd905270f83..b52924e3f995c 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java @@ -23,7 +23,6 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; import 
org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; @@ -43,6 +42,7 @@ import org.apache.lucene.search.TotalHits; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.common.logging.LogConfigurator; import org.elasticsearch.index.codec.vectors.BQVectorUtils; import org.elasticsearch.index.codec.vectors.OptimizedScalarQuantizer; @@ -63,14 +63,11 @@ public class ES818BinaryQuantizedVectorsFormatTests extends BaseKnnVectorsFormat LogConfigurator.configureESLogging(); // native access requires logging to be initialized } + final Codec codec = TestUtil.alwaysKnnVectorsFormat(new ES818BinaryQuantizedVectorsFormat()); + @Override protected Codec getCodec() { - return new Lucene101Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new ES818BinaryQuantizedVectorsFormat(); - } - }; + return codec; } public void testSearch() throws Exception { diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818HnswBinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818HnswBinaryQuantizedVectorsFormatTests.java index cf7b710ef3093..3ba7a13b5a59e 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818HnswBinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818HnswBinaryQuantizedVectorsFormatTests.java @@ -23,7 +23,6 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; @@ -39,6 +38,7 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.SameThreadExecutorService; import org.elasticsearch.common.logging.LogConfigurator; @@ -59,14 +59,11 @@ public class ES818HnswBinaryQuantizedVectorsFormatTests extends BaseKnnVectorsFo LogConfigurator.configureESLogging(); // native access requires logging to be initialized } + final Codec codec = TestUtil.alwaysKnnVectorsFormat(new ES818HnswBinaryQuantizedVectorsFormat()); + @Override protected Codec getCodec() { - return new Lucene101Codec() { - @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new ES818HnswBinaryQuantizedVectorsFormat(); - } - }; + return codec; } public void testToString() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/zstd/StoredFieldCodecDuelTests.java b/server/src/test/java/org/elasticsearch/index/codec/zstd/StoredFieldCodecDuelTests.java index 0e5732ec09e5b..34b360d797930 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/zstd/StoredFieldCodecDuelTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/zstd/StoredFieldCodecDuelTests.java @@ -10,7 +10,7 @@ package org.elasticsearch.index.codec.zstd; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import 
org.apache.lucene.codecs.lucene103.Lucene103Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.DirectoryReader; @@ -35,13 +35,13 @@ public class StoredFieldCodecDuelTests extends ESTestCase { private static final String DOUBLE_FIELD = "double_field_5"; public void testDuelBestSpeed() throws IOException { - var baseline = new LegacyPerFieldMapperCodec(Lucene101Codec.Mode.BEST_SPEED, null, BigArrays.NON_RECYCLING_INSTANCE); + var baseline = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_SPEED, null, BigArrays.NON_RECYCLING_INSTANCE); var contender = new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_SPEED, null, BigArrays.NON_RECYCLING_INSTANCE); doTestDuel(baseline, contender); } public void testDuelBestCompression() throws IOException { - var baseline = new LegacyPerFieldMapperCodec(Lucene101Codec.Mode.BEST_COMPRESSION, null, BigArrays.NON_RECYCLING_INSTANCE); + var baseline = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_COMPRESSION, null, BigArrays.NON_RECYCLING_INSTANCE); var contender = new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_COMPRESSION, null, BigArrays.NON_RECYCLING_INSTANCE); doTestDuel(baseline, contender); } diff --git a/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestCompressionStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestCompressionStoredFieldsFormatTests.java index b6fefcb9a4e98..f89fa52256e15 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestCompressionStoredFieldsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestCompressionStoredFieldsFormatTests.java @@ -11,11 +11,11 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; public class Zstd814BestCompressionStoredFieldsFormatTests extends BaseStoredFieldsFormatTestCase { - private final Codec codec = new Elasticsearch900Lucene101Codec(Zstd814StoredFieldsFormat.Mode.BEST_COMPRESSION); + private final Codec codec = new Elasticsearch92Lucene103Codec(Zstd814StoredFieldsFormat.Mode.BEST_COMPRESSION); @Override protected Codec getCodec() { diff --git a/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestSpeedStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestSpeedStoredFieldsFormatTests.java index 98318707f6c4b..f3d120ed185e7 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestSpeedStoredFieldsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/zstd/Zstd814BestSpeedStoredFieldsFormatTests.java @@ -11,11 +11,11 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; public class Zstd814BestSpeedStoredFieldsFormatTests extends BaseStoredFieldsFormatTestCase { - private final Codec codec = new Elasticsearch900Lucene101Codec(Zstd814StoredFieldsFormat.Mode.BEST_SPEED); + private final Codec codec = new Elasticsearch92Lucene103Codec(Zstd814StoredFieldsFormat.Mode.BEST_SPEED); @Override protected Codec getCodec() { diff --git 
a/server/src/test/java/org/elasticsearch/index/engine/CompletionStatsCacheTests.java b/server/src/test/java/org/elasticsearch/index/engine/CompletionStatsCacheTests.java index 1343078906d6f..835a6457973fe 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/CompletionStatsCacheTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/CompletionStatsCacheTests.java @@ -8,8 +8,6 @@ */ package org.elasticsearch.index.engine; -import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; @@ -17,6 +15,7 @@ import org.apache.lucene.search.suggest.document.Completion101PostingsFormat; import org.apache.lucene.search.suggest.document.SuggestField; import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.core.IOUtils; import org.elasticsearch.index.cache.query.TrivialQueryCachingPolicy; @@ -44,13 +43,7 @@ public void testExceptionsAreNotCached() { public void testCompletionStatsCache() throws IOException, InterruptedException { final IndexWriterConfig indexWriterConfig = newIndexWriterConfig(); - final PostingsFormat postingsFormat = new Completion101PostingsFormat(); - indexWriterConfig.setCodec(new Lucene101Codec() { - @Override - public PostingsFormat getPostingsFormatForField(String field) { - return postingsFormat; // all fields are suggest fields - } - }); + indexWriterConfig.setCodec(TestUtil.alwaysPostingsFormat(new Completion101PostingsFormat())); try (Directory directory = newDirectory(); IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig)) { diff --git a/server/src/test/java/org/elasticsearch/index/mapper/DateFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/DateFieldTypeTests.java index 16c2c5ca5ddb8..aa444b58bc580 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/DateFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/DateFieldTypeTests.java @@ -317,7 +317,7 @@ public void testRangeQuery() throws IOException { Query expected = new IndexOrDocValuesQuery( LongPoint.newRangeQuery("field", instant1, instant2), SortedNumericDocValuesField.newSlowRangeQuery("field", instant1, instant2) - ); + ).rewrite(newSearcher(new MultiReader())); assertEquals(expected, ft.rangeQuery(date1, date2, true, true, null, null, null, context).rewrite(newSearcher(new MultiReader()))); MappedFieldType ft2 = new DateFieldType("field", false); diff --git a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java index 0d7f16211aa51..9f2fb0d91a1cc 100644 --- a/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java +++ b/server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java @@ -815,7 +815,8 @@ public void testNumericSortOptimization() throws Exception { final SortAndFormats formatsLongDate = new SortAndFormats(sortLongDate, new DocValueFormat[] { DocValueFormat.RAW, dvFormatDate }); final SortAndFormats formatsDateLong = new SortAndFormats(sortDateLong, new DocValueFormat[] { dvFormatDate, DocValueFormat.RAW }); - Query q = LongPoint.newRangeQuery(fieldNameLong, startLongValue, startLongValue + numDocs); + // query all but one doc to avoid optimizations that may rewrite to a MatchAllDocs, which 
simplifies assertions + Query q = LongPoint.newRangeQuery(fieldNameLong, startLongValue, startLongValue + numDocs - 2); // 1. Test sort optimization on long field try (TestSearchContext searchContext = createContext(newContextSearcher(reader), q)) { @@ -883,7 +884,7 @@ public void testNumericSortOptimization() throws Exception { QueryPhase.addCollectorsAndSearch(searchContext); assertTrue(searchContext.sort().sort.getSort()[0].getOptimizeSortWithPoints()); assertThat(searchContext.queryResult().topDocs().topDocs.scoreDocs, arrayWithSize(0)); - assertThat(searchContext.queryResult().topDocs().topDocs.totalHits.value(), equalTo((long) numDocs)); + assertThat(searchContext.queryResult().topDocs().topDocs.totalHits.value(), equalTo((long) numDocs - 1)); assertThat(searchContext.queryResult().topDocs().topDocs.totalHits.relation(), equalTo(TotalHits.Relation.EQUAL_TO)); } @@ -994,8 +995,7 @@ public void testMinScore() throws Exception { QueryPhase.addCollectorsAndSearch(context); TotalHits totalHits = context.queryResult().topDocs().topDocs.totalHits; assertThat(totalHits.value(), greaterThanOrEqualTo(5L)); - var expectedRelation = totalHits.value() == 10 ? Relation.EQUAL_TO : Relation.GREATER_THAN_OR_EQUAL_TO; - assertThat(totalHits.relation(), is(expectedRelation)); + assertThat(totalHits.relation(), is(Relation.GREATER_THAN_OR_EQUAL_TO)); } } diff --git a/server/src/test/java/org/elasticsearch/search/vectors/RescoreKnnVectorQueryTests.java b/server/src/test/java/org/elasticsearch/search/vectors/RescoreKnnVectorQueryTests.java index 05b7bc9ef4f82..bab62994e2a06 100644 --- a/server/src/test/java/org/elasticsearch/search/vectors/RescoreKnnVectorQueryTests.java +++ b/server/src/test/java/org/elasticsearch/search/vectors/RescoreKnnVectorQueryTests.java @@ -29,7 +29,7 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.Weight; import org.apache.lucene.store.Directory; -import org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec; +import org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; import org.elasticsearch.index.codec.vectors.ES813Int8FlatVectorFormat; import org.elasticsearch.index.codec.vectors.ES814HnswScalarQuantizedVectorsFormat; import org.elasticsearch.index.codec.vectors.es818.ES818BinaryQuantizedVectorsFormat; @@ -211,7 +211,7 @@ private static void addRandomDocuments(int numDocs, Directory d, int numDims) th new ES813Int8FlatVectorFormat(), new ES814HnswScalarQuantizedVectorsFormat() ); - iwc.setCodec(new Elasticsearch900Lucene101Codec(randomFrom(Zstd814StoredFieldsFormat.Mode.values())) { + iwc.setCodec(new Elasticsearch92Lucene103Codec(randomFrom(Zstd814StoredFieldsFormat.Mode.values())) { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return format; diff --git a/test/test-clusters/src/main/java/org/elasticsearch/test/cluster/FeatureFlag.java b/test/test-clusters/src/main/java/org/elasticsearch/test/cluster/FeatureFlag.java index eea25b85b8548..49b79dc0a1350 100644 --- a/test/test-clusters/src/main/java/org/elasticsearch/test/cluster/FeatureFlag.java +++ b/test/test-clusters/src/main/java/org/elasticsearch/test/cluster/FeatureFlag.java @@ -19,7 +19,8 @@ public enum FeatureFlag { TIME_SERIES_MODE("es.index_mode_feature_flag_registered=true", Version.fromString("8.0.0"), null), SUB_OBJECTS_AUTO_ENABLED("es.sub_objects_auto_feature_flag_enabled=true", Version.fromString("8.16.0"), null), DOC_VALUES_SKIPPER("es.doc_values_skipper_feature_flag_enabled=true", Version.fromString("8.18.1"), null), - 
USE_LUCENE101_POSTINGS_FORMAT("es.use_lucene101_postings_format_feature_flag_enabled=true", Version.fromString("9.1.0"), null); + USE_LUCENE101_POSTINGS_FORMAT("es.use_lucene101_postings_format_feature_flag_enabled=true", Version.fromString("9.1.0"), null), + USE_LUCENE103_POSTINGS_FORMAT("es.use_lucene103_postings_format_feature_flag_enabled=true", Version.fromString("9.2.0"), null); public final String systemProperty; public final Version from;