elastic · przemekwitek · Jun 25, 2025 · Jul 21, 2025 · Jul 21, 2025 · Jul 21, 2025
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -152,11 +152,10 @@ exit
 Grab the async profiler from https://github.com/jvm-profiling-tools/async-profiler
 and run `prof async` like so:
 ```
-gradlew -p benchmarks/ run --args 'LongKeyedBucketOrdsBenchmark.multiBucket -prof "async:libPath=/home/nik9000/Downloads/async-profiler-3.0-29ee888-linux-x64/lib/libasyncProfiler.so;dir=/tmp/prof;output=flamegraph"'
+gradlew -p benchmarks/ run --args 'LongKeyedBucketOrdsBenchmark.multiBucket -prof "async:libPath=/home/nik9000/Downloads/async-profiler-4.0-linux-x64/lib/libasyncProfiler.so;dir=/tmp/prof;output=flamegraph"'
 ```
 
-Note: As of January 2025 the latest release of async profiler doesn't work
-      with our JDK but the nightly is fine.
+Note: As of July 2025 the 4.0 release of the async profiler works well.
 
 If you are on Mac, this'll warn you that you downloaded the shared library from
 the internet. You'll need to go to settings and allow it to run.

diff --git a/...operator/ValuesSourceReaderBenchmark.java → ...tly/esql/ValuesSourceReaderBenchmark.java b/...operator/ValuesSourceReaderBenchmark.java → ...tly/esql/ValuesSourceReaderBenchmark.java
@@ -7,7 +7,7 @@
  * License v3.0 only", or the "Server Side Public License, v 1".
  */
 
-package org.elasticsearch.benchmark.compute.operator;
+package org.elasticsearch.benchmark._nightly.esql;
 
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.NumericDocValuesField;
@@ -24,8 +24,10 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.NumericUtils;
 import org.elasticsearch.common.breaker.NoopCircuitBreaker;
+import org.elasticsearch.common.logging.LogConfigurator;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.compute.data.BlockFactory;
 import org.elasticsearch.compute.data.BytesRefBlock;
@@ -85,10 +87,23 @@
 @State(Scope.Thread)
 @Fork(1)
 public class ValuesSourceReaderBenchmark {
+    static {
+        LogConfigurator.configureESLogging();
+    }
+
+    private static final String[] SUPPORTED_LAYOUTS = new String[] { "in_order", "shuffled", "shuffled_singles" };
+    private static final String[] SUPPORTED_NAMES = new String[] {
+        "long",
+        "int",
+        "double",
+        "keyword",
+        "stored_keyword",
+        "3_stored_keywords",
+        "keyword_mv" };
+
     private static final int BLOCK_LENGTH = 16 * 1024;
     private static final int INDEX_SIZE = 10 * BLOCK_LENGTH;
     private static final int COMMIT_INTERVAL = 500;
-    private static final BigArrays BIG_ARRAYS = BigArrays.NON_RECYCLING_INSTANCE;
     private static final BlockFactory blockFactory = BlockFactory.getInstance(
         new NoopCircuitBreaker("noop"),
         BigArrays.NON_RECYCLING_INSTANCE
@@ -104,8 +119,8 @@ static void selfTest() {
             ValuesSourceReaderBenchmark benchmark = new ValuesSourceReaderBenchmark();
             benchmark.setupIndex();
             try {
-                for (String layout : ValuesSourceReaderBenchmark.class.getField("layout").getAnnotationsByType(Param.class)[0].value()) {
-                    for (String name : ValuesSourceReaderBenchmark.class.getField("name").getAnnotationsByType(Param.class)[0].value()) {
+                for (String layout : ValuesSourceReaderBenchmark.SUPPORTED_LAYOUTS) {
+                    for (String name : ValuesSourceReaderBenchmark.SUPPORTED_NAMES) {
                         benchmark.layout = layout;
                         benchmark.name = name;
                         try {
@@ -119,7 +134,7 @@ static void selfTest() {
             } finally {
                 benchmark.teardownIndex();
             }
-        } catch (IOException | NoSuchFieldException e) {
+        } catch (IOException e) {
             throw new AssertionError(e);
         }
     }
@@ -321,10 +336,10 @@ public FieldNamesFieldMapper.FieldNamesFieldType fieldNames() {
      *     each page has a single document rather than {@code BLOCK_SIZE} docs.</li>
      * </ul>
      */
-    @Param({ "in_order", "shuffled", "shuffled_singles" })
+    @Param({ "in_order", "shuffled" })
     public String layout;
 
-    @Param({ "long", "int", "double", "keyword", "stored_keyword", "3_stored_keywords" })
+    @Param({ "long", "keyword", "stored_keyword", "keyword_mv" })
     public String name;
 
     private Directory directory;
@@ -336,6 +351,7 @@ public FieldNamesFieldMapper.FieldNamesFieldType fieldNames() {
     public void benchmark() {
         ValuesSourceReaderOperator op = new ValuesSourceReaderOperator(
             blockFactory,
+            ByteSizeValue.ofMb(1).getBytes(),
             fields(name),
             List.of(new ValuesSourceReaderOperator.ShardContext(reader, () -> {
                 throw new UnsupportedOperationException("can't load _source here");
@@ -390,6 +406,22 @@ public void benchmark() {
                         }
                     }
                 }
+                case "keyword_mv" -> {
+                    BytesRef scratch = new BytesRef();
+                    BytesRefBlock values = op.getOutput().<BytesRefBlock>getBlock(1);
+                    for (int p = 0; p < values.getPositionCount(); p++) {
+                        int count = values.getValueCount(p);
+                        if (count > 0) {
+                            int first = values.getFirstValueIndex(p);
+                            for (int i = 0; i < count; i++) {
+                                BytesRef r = values.getBytesRef(first + i, scratch);
+                                r.offset++;
+                                r.length--;
+                                sum += Integer.parseInt(r.utf8ToString());
+                            }
+                        }
+                    }
+                }
             }
         }
         long expected = 0;
@@ -399,6 +431,16 @@ public void benchmark() {
                     expected += i % 1000;
                 }
                 break;
+            case "keyword_mv":
+                for (int i = 0; i < INDEX_SIZE; i++) {
+                    int v1 = i % 1000;
+                    expected += v1;
+                    int v2 = i % 500;
+                    if (v1 != v2) {
+                        expected += v2;
+                    }
+                }
+                break;
             case "3_stored_keywords":
                 for (int i = 0; i < INDEX_SIZE; i++) {
                     expected += 3 * (i % 1000);
@@ -453,7 +495,9 @@ private void setupIndex() throws IOException {
                         new StoredField("double", (double) i),
                         new KeywordFieldMapper.KeywordField("keyword_1", new BytesRef(c + i % 1000), keywordFieldType),
                         new KeywordFieldMapper.KeywordField("keyword_2", new BytesRef(c + i % 1000), keywordFieldType),
-                        new KeywordFieldMapper.KeywordField("keyword_3", new BytesRef(c + i % 1000), keywordFieldType)
+                        new KeywordFieldMapper.KeywordField("keyword_3", new BytesRef(c + i % 1000), keywordFieldType),
+                        new KeywordFieldMapper.KeywordField("keyword_mv", new BytesRef(c + i % 1000), keywordFieldType),
+                        new KeywordFieldMapper.KeywordField("keyword_mv", new BytesRef(c + i % 500), keywordFieldType)
                     )
                 );
                 if (i % COMMIT_INTERVAL == 0) {

diff --git a/...marks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java b/...marks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java
@@ -191,11 +191,12 @@ private static Operator operator(DriverContext driverContext, String grouping, S
                 new BlockHash.GroupSpec(2, ElementType.BYTES_REF)
             );
             case TOP_N_LONGS -> List.of(
-                new BlockHash.GroupSpec(0, ElementType.LONG, false, new BlockHash.TopNDef(0, true, true, TOP_N_LIMIT))
+                new BlockHash.GroupSpec(0, ElementType.LONG, null, new BlockHash.TopNDef(0, true, true, TOP_N_LIMIT), null)
             );
             default -> throw new IllegalArgumentException("unsupported grouping [" + grouping + "]");
         };
         return new HashAggregationOperator(
+            groups,
             List.of(supplier(op, dataType, filter).groupingAggregatorFactory(AggregatorMode.SINGLE, List.of(groups.size()))),
             () -> BlockHash.build(groups, driverContext.blockFactory(), 16 * 1024, false),
             driverContext

diff --git a/...src/main/java/org/elasticsearch/benchmark/compute/operator/ValuesAggregatorBenchmark.java b/...src/main/java/org/elasticsearch/benchmark/compute/operator/ValuesAggregatorBenchmark.java
@@ -95,8 +95,7 @@ static void selfTest() {
         try {
             for (String groups : ValuesAggregatorBenchmark.class.getField("groups").getAnnotationsByType(Param.class)[0].value()) {
                 for (String dataType : ValuesAggregatorBenchmark.class.getField("dataType").getAnnotationsByType(Param.class)[0].value()) {
-                    run(Integer.parseInt(groups), dataType, 10, 0);
-                    run(Integer.parseInt(groups), dataType, 10, 1);
+                    run(Integer.parseInt(groups), dataType, 10);
                 }
             }
         } catch (NoSuchFieldException e) {
@@ -114,10 +113,7 @@ static void selfTest() {
     @Param({ BYTES_REF, INT, LONG })
     public String dataType;
 
-    @Param({ "0", "1" })
-    public int numOrdinalMerges;
-
-    private static Operator operator(DriverContext driverContext, int groups, String dataType, int numOrdinalMerges) {
+    private static Operator operator(DriverContext driverContext, int groups, String dataType) {
         if (groups == 1) {
             return new AggregationOperator(
                 List.of(supplier(dataType).aggregatorFactory(AggregatorMode.SINGLE, List.of(0)).apply(driverContext)),
@@ -126,26 +122,15 @@ private static Operator operator(DriverContext driverContext, int groups, String
         }
         List<BlockHash.GroupSpec> groupSpec = List.of(new BlockHash.GroupSpec(0, ElementType.LONG));
         return new HashAggregationOperator(
+            groupSpec,
             List.of(supplier(dataType).groupingAggregatorFactory(AggregatorMode.SINGLE, List.of(1))),
             () -> BlockHash.build(groupSpec, driverContext.blockFactory(), 16 * 1024, false),
             driverContext
         ) {
             @Override
             public Page getOutput() {
-                mergeOrdinal();
                 return super.getOutput();
             }
-
-            // simulate OrdinalsGroupingOperator
-            void mergeOrdinal() {
-                var merged = supplier(dataType).groupingAggregatorFactory(AggregatorMode.SINGLE, List.of(1)).apply(driverContext);
-                for (int i = 0; i < numOrdinalMerges; i++) {
-                    for (int p = 0; p < groups; p++) {
-                        merged.addIntermediateRow(p, aggregators.getFirst(), p);
-                    }
-                }
-                aggregators.set(0, merged);
-            }
         };
     }
 
@@ -352,12 +337,12 @@ private static Block groupingBlock(int groups) {
 
     @Benchmark
     public void run() {
-        run(groups, dataType, OP_COUNT, numOrdinalMerges);
+        run(groups, dataType, OP_COUNT);
     }
 
-    private static void run(int groups, String dataType, int opCount, int numOrdinalMerges) {
+    private static void run(int groups, String dataType, int opCount) {
         DriverContext driverContext = driverContext();
-        try (Operator operator = operator(driverContext, groups, dataType, numOrdinalMerges)) {
+        try (Operator operator = operator(driverContext, groups, dataType)) {
             Page page = page(groups, dataType);
             for (int i = 0; i < opCount; i++) {
                 operator.addInput(page.shallowCopy());

diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/vector/Int4ScorerBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/vector/Int4ScorerBenchmark.java
@@ -8,12 +8,14 @@
  */
 package org.elasticsearch.benchmark.vector;
 
+import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.MMapDirectory;
 import org.apache.lucene.util.VectorUtil;
+import org.apache.lucene.util.quantization.OptimizedScalarQuantizer;
 import org.elasticsearch.common.logging.LogConfigurator;
 import org.elasticsearch.core.IOUtils;
 import org.elasticsearch.simdvec.ES91Int4VectorsScorer;
@@ -52,20 +54,26 @@ public class Int4ScorerBenchmark {
         LogConfigurator.configureESLogging(); // native access requires logging to be initialized
     }
 
-    @Param({ "384", "702", "1024" })
+    @Param({ "384", "782", "1024" })
     int dims;
 
-    int numVectors = 200;
-    int numQueries = 10;
+    int numVectors = 20 * ES91Int4VectorsScorer.BULK_SIZE;
+    int numQueries = 5;
 
     byte[] scratch;
     byte[][] binaryVectors;
     byte[][] binaryQueries;
+    float[] scores = new float[ES91Int4VectorsScorer.BULK_SIZE];
+
+    float[] scratchFloats = new float[3];
 
     ES91Int4VectorsScorer scorer;
     Directory dir;
     IndexInput in;
 
+    OptimizedScalarQuantizer.QuantizationResult queryCorrections;
+    float centroidDp;
+
     @Setup
     public void setup() throws IOException {
         binaryVectors = new byte[numVectors][dims];
@@ -77,9 +85,19 @@ public void setup() throws IOException {
                     binaryVector[i] = (byte) ThreadLocalRandom.current().nextInt(16);
                 }
                 out.writeBytes(binaryVector, 0, binaryVector.length);
+                ThreadLocalRandom.current().nextBytes(binaryVector);
+                out.writeBytes(binaryVector, 0, 14); // corrections
             }
         }
 
+        queryCorrections = new OptimizedScalarQuantizer.QuantizationResult(
+            ThreadLocalRandom.current().nextFloat(),
+            ThreadLocalRandom.current().nextFloat(),
+            ThreadLocalRandom.current().nextFloat(),
+            Short.toUnsignedInt((short) ThreadLocalRandom.current().nextInt())
+        );
+        centroidDp = ThreadLocalRandom.current().nextFloat();
+
         in = dir.openInput("vectors", IOContext.DEFAULT);
         binaryQueries = new byte[numVectors][dims];
         for (byte[] binaryVector : binaryVectors) {
@@ -105,18 +123,66 @@ public void scoreFromArray(Blackhole bh) throws IOException {
             in.seek(0);
             for (int i = 0; i < numVectors; i++) {
                 in.readBytes(scratch, 0, dims);
-                bh.consume(VectorUtil.int4DotProduct(binaryQueries[j], scratch));
+                int dp = VectorUtil.int4DotProduct(binaryQueries[j], scratch);
+                in.readFloats(scratchFloats, 0, 3);
+                float score = scorer.applyCorrections(
+                    queryCorrections.lowerInterval(),
+                    queryCorrections.upperInterval(),
+                    queryCorrections.quantizedComponentSum(),
+                    queryCorrections.additionalCorrection(),
+                    VectorSimilarityFunction.EUCLIDEAN,
+                    centroidDp, // assuming no centroid dot product for this benchmark
+                    scratchFloats[0],
+                    scratchFloats[1],
+                    Short.toUnsignedInt(in.readShort()),
+                    scratchFloats[2],
+                    dp
+                );
+                bh.consume(score);
             }
         }
     }
 
     @Benchmark
     @Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
-    public void scoreFromMemorySegmentOnlyVector(Blackhole bh) throws IOException {
+    public void scoreFromMemorySegment(Blackhole bh) throws IOException {
         for (int j = 0; j < numQueries; j++) {
             in.seek(0);
             for (int i = 0; i < numVectors; i++) {
-                bh.consume(scorer.int4DotProduct(binaryQueries[j]));
+                bh.consume(
+                    scorer.score(
+                        binaryQueries[j],
+                        queryCorrections.lowerInterval(),
+                        queryCorrections.upperInterval(),
+                        queryCorrections.quantizedComponentSum(),
+                        queryCorrections.additionalCorrection(),
+                        VectorSimilarityFunction.EUCLIDEAN,
+                        centroidDp
+                    )
+                );
+            }
+        }
+    }
+
+    @Benchmark
+    @Fork(jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
+    public void scoreFromMemorySegmentBulk(Blackhole bh) throws IOException {
+        for (int j = 0; j < numQueries; j++) {
+            in.seek(0);
+            for (int i = 0; i < numVectors; i += ES91Int4VectorsScorer.BULK_SIZE) {
+                scorer.scoreBulk(
+                    binaryQueries[j],
+                    queryCorrections.lowerInterval(),
+                    queryCorrections.upperInterval(),
+                    queryCorrections.quantizedComponentSum(),
+                    queryCorrections.additionalCorrection(),
+                    VectorSimilarityFunction.EUCLIDEAN,
+                    centroidDp,
+                    scores
+                );
+                for (float score : scores) {
+                    bh.consume(score);
+                }
             }
         }
     }

diff --git a/...tor/ValuesSourceReaderBenchmarkTests.java → ...sql/ValuesSourceReaderBenchmarkTests.java b/...tor/ValuesSourceReaderBenchmarkTests.java → ...sql/ValuesSourceReaderBenchmarkTests.java
@@ -7,7 +7,7 @@
  * License v3.0 only", or the "Server Side Public License, v 1".
  */
 
-package org.elasticsearch.benchmark.compute.operator;
+package org.elasticsearch.benchmark._nightly.esql;
 
 import org.elasticsearch.test.ESTestCase;