From 6f5606cd1864a44c4703748941702d15b8211bea Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Tue, 29 Jul 2025 10:21:25 +0200 Subject: [PATCH 1/3] Expose mergePolicy in KnnIndexTester --- .../org/elasticsearch/test/knn/CmdLineArgs.java | 14 ++++++++++++-- .../elasticsearch/test/knn/KnnIndexTester.java | 17 ++++++++++++++++- .../org/elasticsearch/test/knn/KnnIndexer.java | 10 ++++++++-- 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java index f51c550e5292e..c4cd4e8b7bdc0 100644 --- a/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java +++ b/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java @@ -52,7 +52,8 @@ record CmdLineArgs( int quantizeBits, VectorEncoding vectorEncoding, int dimensions, - boolean earlyTermination + boolean earlyTermination, + String mergePolicy ) implements ToXContentObject { static final ParseField DOC_VECTORS_FIELD = new ParseField("doc_vectors"); @@ -79,6 +80,7 @@ record CmdLineArgs( static final ParseField EARLY_TERMINATION_FIELD = new ParseField("early_termination"); static final ParseField FILTER_SELECTIVITY_FIELD = new ParseField("filter_selectivity"); static final ParseField SEED_FIELD = new ParseField("seed"); + static final ParseField MERGE_POLICY_FIELD = new ParseField("merge_policy"); static CmdLineArgs fromXContent(XContentParser parser) throws IOException { Builder builder = PARSER.apply(parser, null); @@ -112,6 +114,7 @@ static CmdLineArgs fromXContent(XContentParser parser) throws IOException { PARSER.declareBoolean(Builder::setEarlyTermination, EARLY_TERMINATION_FIELD); PARSER.declareFloat(Builder::setFilterSelectivity, FILTER_SELECTIVITY_FIELD); PARSER.declareLong(Builder::setSeed, SEED_FIELD); + PARSER.declareString(Builder::setMergePolicy, MERGE_POLICY_FIELD); } @Override @@ -179,6 +182,7 @@ static class Builder { private boolean earlyTermination; private float filterSelectivity = 1f; private long seed = 1751900822751L; + private String mergePolicy = null; public Builder setDocVectors(List docVectors) { if (docVectors == null || docVectors.isEmpty()) { @@ -304,6 +308,11 @@ public Builder setSeed(long seed) { return this; } + public Builder setMergePolicy(String mergePolicy) { + this.mergePolicy = mergePolicy; + return this; + } + public CmdLineArgs build() { if (docVectors == null) { throw new IllegalArgumentException("Document vectors path must be provided"); @@ -337,7 +346,8 @@ public CmdLineArgs build() { quantizeBits, vectorEncoding, dimensions, - earlyTermination + earlyTermination, + mergePolicy ); } } diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java index c4b0ccdfe35e3..a4ffe8fc5295d 100644 --- a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java +++ b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java @@ -15,6 +15,10 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.index.LogByteSizeMergePolicy; +import org.apache.lucene.index.MergePolicy; +import org.apache.lucene.index.NoMergePolicy; +import org.apache.lucene.index.TieredMergePolicy; import org.elasticsearch.cli.ProcessInfo; import org.elasticsearch.common.Strings; import org.elasticsearch.common.logging.LogConfigurator; @@ -196,6 +200,16 @@ public static void main(String[] args) throws Exception { logger.info("Running KNN index tester with arguments: " + cmdLineArgs); Codec codec = createCodec(cmdLineArgs); Path indexPath = PathUtils.get(formatIndexPath(cmdLineArgs)); + MergePolicy mergePolicy = null; + if (cmdLineArgs.mergePolicy() != null && cmdLineArgs.mergePolicy().isEmpty() == false) { + if ("tmp".equalsIgnoreCase(cmdLineArgs.mergePolicy())) { + mergePolicy = new TieredMergePolicy(); + } else if ("lbmp".equalsIgnoreCase(cmdLineArgs.mergePolicy())) { + mergePolicy = new LogByteSizeMergePolicy(); + } else if ("no".equalsIgnoreCase(cmdLineArgs.mergePolicy())) { + mergePolicy = NoMergePolicy.INSTANCE; + } + } if (cmdLineArgs.reindex() || cmdLineArgs.forceMerge()) { KnnIndexer knnIndexer = new KnnIndexer( cmdLineArgs.docVectors(), @@ -205,7 +219,8 @@ public static void main(String[] args) throws Exception { cmdLineArgs.vectorEncoding(), cmdLineArgs.dimensions(), cmdLineArgs.vectorSpace(), - cmdLineArgs.numDocs() + cmdLineArgs.numDocs(), + mergePolicy ); if (cmdLineArgs.reindex() == false && Files.exists(indexPath) == false) { throw new IllegalArgumentException("Index path does not exist: " + indexPath); diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexer.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexer.java index f7d00c9806c8d..aa8792bb2c4a5 100644 --- a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexer.java +++ b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexer.java @@ -31,6 +31,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.FSDirectory; @@ -69,6 +70,7 @@ class KnnIndexer { private final Codec codec; private final int numDocs; private final int numIndexThreads; + private final MergePolicy mergePolicy; KnnIndexer( List docsPath, @@ -78,7 +80,8 @@ class KnnIndexer { VectorEncoding vectorEncoding, int dim, VectorSimilarityFunction similarityFunction, - int numDocs + int numDocs, + MergePolicy mergePolicy ) { this.docsPath = docsPath; this.indexPath = indexPath; @@ -88,6 +91,7 @@ class KnnIndexer { this.dim = dim; this.similarityFunction = similarityFunction; this.numDocs = numDocs; + this.mergePolicy = mergePolicy; } void numSegments(KnnIndexTester.Results result) { @@ -103,7 +107,9 @@ void createIndex(KnnIndexTester.Results result) throws IOException, InterruptedE iwc.setCodec(codec); iwc.setRAMBufferSizeMB(WRITER_BUFFER_MB); iwc.setUseCompoundFile(false); - + if (mergePolicy != null) { + iwc.setMergePolicy(mergePolicy); + } iwc.setMaxFullFlushMergeWaitMillis(0); iwc.setInfoStream(new PrintStreamInfoStream(System.out) { From db3d26253fb5e69a83e39de6e9670cd68e30b2d3 Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Tue, 29 Jul 2025 10:25:59 +0200 Subject: [PATCH 2/3] add ldmp --- .../main/java/org/elasticsearch/test/knn/KnnIndexTester.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java index a4ffe8fc5295d..d2d8b8ad7f0ed 100644 --- a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java +++ b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java @@ -16,6 +16,7 @@ import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.index.LogByteSizeMergePolicy; +import org.apache.lucene.index.LogDocMergePolicy; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.TieredMergePolicy; @@ -208,6 +209,8 @@ public static void main(String[] args) throws Exception { mergePolicy = new LogByteSizeMergePolicy(); } else if ("no".equalsIgnoreCase(cmdLineArgs.mergePolicy())) { mergePolicy = NoMergePolicy.INSTANCE; + } else if ("ldmp".equalsIgnoreCase(cmdLineArgs.mergePolicy())) { + mergePolicy = new LogDocMergePolicy(); } } if (cmdLineArgs.reindex() || cmdLineArgs.forceMerge()) { From 485ae9416d053f6f96152b640249b6fbfbace162 Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Tue, 29 Jul 2025 16:31:31 +0200 Subject: [PATCH 3/3] made merge_policy an enum --- .../elasticsearch/test/knn/CmdLineArgs.java | 6 +-- .../test/knn/KnnIndexTester.java | 38 +++++++++++++------ 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java index c4cd4e8b7bdc0..85fa02aecaaef 100644 --- a/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java +++ b/qa/vector/src/main/java/org/elasticsearch/test/knn/CmdLineArgs.java @@ -53,7 +53,7 @@ record CmdLineArgs( VectorEncoding vectorEncoding, int dimensions, boolean earlyTermination, - String mergePolicy + KnnIndexTester.MergePolicyType mergePolicy ) implements ToXContentObject { static final ParseField DOC_VECTORS_FIELD = new ParseField("doc_vectors"); @@ -182,7 +182,7 @@ static class Builder { private boolean earlyTermination; private float filterSelectivity = 1f; private long seed = 1751900822751L; - private String mergePolicy = null; + private KnnIndexTester.MergePolicyType mergePolicy = null; public Builder setDocVectors(List docVectors) { if (docVectors == null || docVectors.isEmpty()) { @@ -309,7 +309,7 @@ public Builder setSeed(long seed) { } public Builder setMergePolicy(String mergePolicy) { - this.mergePolicy = mergePolicy; + this.mergePolicy = KnnIndexTester.MergePolicyType.valueOf(mergePolicy.toUpperCase(Locale.ROOT)); return this; } diff --git a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java index d2d8b8ad7f0ed..17257dcb73d59 100644 --- a/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java +++ b/qa/vector/src/main/java/org/elasticsearch/test/knn/KnnIndexTester.java @@ -74,6 +74,13 @@ enum IndexType { IVF } + enum MergePolicyType { + TIERED, + LOG_BYTE, + NO, + LOG_DOC + } + private static String formatIndexPath(CmdLineArgs args) { List suffix = new ArrayList<>(); if (args.indexType() == IndexType.FLAT) { @@ -201,18 +208,7 @@ public static void main(String[] args) throws Exception { logger.info("Running KNN index tester with arguments: " + cmdLineArgs); Codec codec = createCodec(cmdLineArgs); Path indexPath = PathUtils.get(formatIndexPath(cmdLineArgs)); - MergePolicy mergePolicy = null; - if (cmdLineArgs.mergePolicy() != null && cmdLineArgs.mergePolicy().isEmpty() == false) { - if ("tmp".equalsIgnoreCase(cmdLineArgs.mergePolicy())) { - mergePolicy = new TieredMergePolicy(); - } else if ("lbmp".equalsIgnoreCase(cmdLineArgs.mergePolicy())) { - mergePolicy = new LogByteSizeMergePolicy(); - } else if ("no".equalsIgnoreCase(cmdLineArgs.mergePolicy())) { - mergePolicy = NoMergePolicy.INSTANCE; - } else if ("ldmp".equalsIgnoreCase(cmdLineArgs.mergePolicy())) { - mergePolicy = new LogDocMergePolicy(); - } - } + MergePolicy mergePolicy = getMergePolicy(cmdLineArgs); if (cmdLineArgs.reindex() || cmdLineArgs.forceMerge()) { KnnIndexer knnIndexer = new KnnIndexer( cmdLineArgs.docVectors(), @@ -250,6 +246,24 @@ public static void main(String[] args) throws Exception { logger.info("Results: \n" + formattedResults); } + private static MergePolicy getMergePolicy(CmdLineArgs args) { + MergePolicy mergePolicy = null; + if (args.mergePolicy() != null) { + if (args.mergePolicy() == MergePolicyType.TIERED) { + mergePolicy = new TieredMergePolicy(); + } else if (args.mergePolicy() == MergePolicyType.LOG_BYTE) { + mergePolicy = new LogByteSizeMergePolicy(); + } else if (args.mergePolicy() == MergePolicyType.NO) { + mergePolicy = NoMergePolicy.INSTANCE; + } else if (args.mergePolicy() == MergePolicyType.LOG_DOC) { + mergePolicy = new LogDocMergePolicy(); + } else { + throw new IllegalArgumentException("Invalid merge policy: " + args.mergePolicy()); + } + } + return mergePolicy; + } + static class FormattedResults { List indexResults = new ArrayList<>(); List queryResults = new ArrayList<>();