Merged

32 commits
23ff7c5
ESQL: Heuristics to pick efficient partitioning
nik9000 Mar 26, 2025
47bb86c
Update docs/changelog/125739.yaml
nik9000 Mar 26, 2025
527c69a
Fix TODO
nik9000 Mar 27, 2025
815dec5
Merge remote-tracking branch 'nik9000/esql_auto_partition' into esql_…
nik9000 Mar 27, 2025
c72af75
Report partitioning strategies
nik9000 Mar 27, 2025
e2e817d
Test report
nik9000 Mar 27, 2025
a348ef9
test
nik9000 Mar 27, 2025
d201b4a
[CI] Auto commit changes from spotless
Mar 27, 2025
fd41476
Merge branch 'main' into esql_auto_partition
nik9000 Mar 27, 2025
8533d07
Merge branch 'main' into esql_auto_partition
nik9000 Mar 28, 2025
e72673c
Merge branch 'main' into esql_auto_partition
nik9000 Mar 31, 2025
05e487e
Merge branch 'main' into esql_auto_partition
nik9000 Apr 1, 2025
bc92147
[CI] Auto commit changes from spotless
Apr 1, 2025
05175e5
Cluster setting
nik9000 Apr 1, 2025
03f029c
in
nik9000 Apr 1, 2025
caf5f05
Merge remote-tracking branch 'nik9000/esql_auto_partition' into esql_…
nik9000 Apr 1, 2025
5ab7032
Fix rewrite
nik9000 Apr 1, 2025
8f72b1d
Merge branch 'main' into esql_auto_partition
nik9000 Apr 2, 2025
ae48a6f
Merge branch 'main' into esql_auto_partition
nik9000 Apr 7, 2025
a4fd6c5
Merge branch 'main' into esql_auto_partition
nik9000 Apr 8, 2025
2f7ecc8
Merge branch 'main' into esql_auto_partition
nik9000 Apr 9, 2025
2ab82a8
Merge branch 'main' into esql_auto_partition
nik9000 Apr 9, 2025
c816934
Fixup merge
nik9000 Apr 9, 2025
bf3c076
Merge branch 'main' into esql_auto_partition
nik9000 Apr 9, 2025
ad9e064
Try and make this more consistent
nik9000 Apr 9, 2025
334c1bc
Merge branch 'main' into esql_auto_partition
nik9000 Apr 9, 2025
66aa691
Merge branch 'main' into esql_auto_partition
nik9000 Apr 9, 2025
409067c
Merge branch 'main' into esql_auto_partition
nik9000 Apr 9, 2025
9b7728e
Merge branch 'main' into esql_auto_partition
nik9000 Apr 10, 2025
a0ebc79
Merge branch 'main' into esql_auto_partition
nik9000 Apr 10, 2025
77608d3
Merge branch 'main' into esql_auto_partition
nik9000 Apr 10, 2025
2eb1ac7
Merge remote-tracking branch 'nik9000/esql_auto_partition' into esql_…
nik9000 Apr 10, 2025
5 changes: 5 additions & 0 deletions docs/changelog/125739.yaml
@@ -0,0 +1,5 @@
pr: 125739
summary: Heuristics to pick efficient partitioning
area: ES|QL
type: enhancement
issues: []
@@ -207,6 +207,7 @@ static TransportVersion def(int id) {
public static final TransportVersion RESCORE_VECTOR_ALLOW_ZERO = def(9_039_0_00);
public static final TransportVersion PROJECT_ID_IN_SNAPSHOT = def(9_040_0_00);
public static final TransportVersion INDEX_STATS_AND_METADATA_INCLUDE_PEAK_WRITE_LOAD = def(9_041_0_00);
public static final TransportVersion ESQL_REPORT_SHARD_PARTITIONING = def(9_042_0_00);

/*
* STOP! READ THIS FIRST! No, really,
@@ -7,11 +7,39 @@

package org.elasticsearch.compute.lucene;

public enum DataPartitioning {
import org.elasticsearch.compute.operator.Driver;

/**
* How we partition the data across {@link Driver}s. Each request forks into
* {@code min(cpus, partition_count)} threads on the data node. More partitions
Member:
Unfortunately, we currently have to use min(1.5 * cpus, partition_count) because drivers consist of both I/O and CPU-bound operations.

Member Author (nik9000):
I'll update the comment.
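
As a worked illustration of that sizing (hypothetical numbers, not taken from this PR): on a 16-core data node, 1.5 * 16 = 24, so a query that yields 40 partitions forks into min(24, 40) = 24 drivers, while a query that yields only 8 partitions forks into 8.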

* allow us to bring more threads to bear on CPU intensive data node side tasks.
*/
public enum DataPartitioning {
/**
* Automatically select the data partitioning based on the query and index.
* Usually that's {@link #SEGMENT}, but for small indices it's {@link #SHARD}.
* When the additional overhead from {@link #DOC} is fairly low then it'll
* pick {@link #DOC}.
*/
AUTO,
/**
* Make one partition per shard. This is generally the slowest option, but it
* has the lowest CPU overhead.
*/
SHARD,

/**
* Partition on segment boundaries, this doesn't allow forking to as many CPUs
* as {@link #DOC} but it has much lower overhead.
* <p>
* It packs segments smaller than {@link LuceneSliceQueue#MAX_DOCS_PER_SLICE}
* docs together into a partition. Larger segments get their own partition.
* Each slice contains no more than {@link LuceneSliceQueue#MAX_SEGMENTS_PER_SLICE}.
*/
SEGMENT,

/**
* Partition each shard into {@code task_concurrency} partitions, splitting
* larger segments into slices. This allows bringing the most CPUs to bear on
* the problem but adds extra overhead, especially in query preparation.
*/
DOC,
}
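
The Javadoc above describes the trade-offs in prose; the AUTO selection logic itself lives in LuceneSliceQueue and is not part of this hunk. The following is a minimal, standalone sketch of that kind of heuristic. The class names, thresholds (SMALL_SHARD_MAX_DOCS, DOC_PARTITIONING_MIN_DOCS), and the queryIsCheapPerDoc flag are illustrative assumptions, not values from this change.

// Standalone sketch of an AUTO-style choice among SHARD, SEGMENT, and DOC.
// Thresholds and names are hypothetical; the PR's real heuristic lives in
// LuceneSliceQueue and may differ.
enum PartitioningChoice { SHARD, SEGMENT, DOC }

final class PartitioningHeuristicSketch {
    // Hypothetical cut-offs, not taken from the PR.
    private static final long SMALL_SHARD_MAX_DOCS = 100_000;
    private static final long DOC_PARTITIONING_MIN_DOCS = 10_000_000;

    static PartitioningChoice choose(long shardDocCount, boolean queryIsCheapPerDoc) {
        if (shardDocCount <= SMALL_SHARD_MAX_DOCS) {
            // Tiny shards: one partition per shard keeps per-partition overhead lowest.
            return PartitioningChoice.SHARD;
        }
        if (shardDocCount >= DOC_PARTITIONING_MIN_DOCS && queryIsCheapPerDoc) {
            // Very large shards with cheap per-doc work: accept DOC's extra
            // query-preparation overhead to spread the work across the most CPUs.
            return PartitioningChoice.DOC;
        }
        // Otherwise segment partitioning balances parallelism against overhead.
        return PartitioningChoice.SEGMENT;
    }

    public static void main(String[] args) {
        System.out.println(choose(50_000, true));      // SHARD
        System.out.println(choose(5_000_000, true));   // SEGMENT
        System.out.println(choose(50_000_000, true));  // DOC
    }
}

In the operator itself the chosen strategy is passed into LuceneSliceQueue.create(...), as the Factory changes below show; the count, max, and min factories pin the strategy with query -> LuceneSliceQueue.PartitioningStrategy.SHARD.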
@@ -49,7 +49,15 @@ public Factory(
int taskConcurrency,
int limit
) {
super(contexts, queryFunction, dataPartitioning, taskConcurrency, limit, ScoreMode.COMPLETE_NO_SCORES);
super(
contexts,
queryFunction,
dataPartitioning,
query -> LuceneSliceQueue.PartitioningStrategy.SHARD,
taskConcurrency,
limit,
ScoreMode.COMPLETE_NO_SCORES
);
}

@Override
@@ -121,7 +121,15 @@ public LuceneMaxFactory(
NumberType numberType,
int limit
) {
super(contexts, queryFunction, dataPartitioning, taskConcurrency, limit, ScoreMode.COMPLETE_NO_SCORES);
super(
contexts,
queryFunction,
dataPartitioning,
query -> LuceneSliceQueue.PartitioningStrategy.SHARD,
taskConcurrency,
limit,
ScoreMode.COMPLETE_NO_SCORES
);
this.fieldName = fieldName;
this.numberType = numberType;
}
@@ -121,7 +121,15 @@ public LuceneMinFactory(
NumberType numberType,
int limit
) {
super(contexts, queryFunction, dataPartitioning, taskConcurrency, limit, ScoreMode.COMPLETE_NO_SCORES);
super(
contexts,
queryFunction,
dataPartitioning,
query -> LuceneSliceQueue.PartitioningStrategy.SHARD,
taskConcurrency,
limit,
ScoreMode.COMPLETE_NO_SCORES
);
this.fieldName = fieldName;
this.numberType = numberType;
}
@@ -9,7 +9,6 @@

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafCollector;
@@ -38,12 +37,16 @@
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.elasticsearch.TransportVersions.ESQL_REPORT_SHARD_PARTITIONING;

public abstract class LuceneOperator extends SourceOperator {
private static final Logger logger = LogManager.getLogger(LuceneOperator.class);

@@ -96,15 +99,15 @@ protected Factory(
List<? extends ShardContext> contexts,
Function<ShardContext, Query> queryFunction,
DataPartitioning dataPartitioning,
Function<Query, LuceneSliceQueue.PartitioningStrategy> autoStrategy,
int taskConcurrency,
int limit,
ScoreMode scoreMode
) {
this.limit = limit;
this.scoreMode = scoreMode;
this.dataPartitioning = dataPartitioning;
var weightFunction = weightFunction(queryFunction, scoreMode);
this.sliceQueue = LuceneSliceQueue.create(contexts, weightFunction, dataPartitioning, taskConcurrency);
this.sliceQueue = LuceneSliceQueue.create(contexts, queryFunction, dataPartitioning, autoStrategy, taskConcurrency, scoreMode);
this.taskConcurrency = Math.min(sliceQueue.totalSlices(), taskConcurrency);
}

@@ -271,6 +274,7 @@ public static class Status implements Operator.Status {
private final int sliceMax;
private final int current;
private final long rowsEmitted;
private final Map<String, LuceneSliceQueue.PartitioningStrategy> partitioningStrategies;

private Status(LuceneOperator operator) {
processedSlices = operator.processedSlices;
@@ -296,6 +300,7 @@ private Status(LuceneOperator operator) {
}
pagesEmitted = operator.pagesEmitted;
rowsEmitted = operator.rowsEmitted;
partitioningStrategies = operator.sliceQueue.partitioningStrategies();
}

Status(
@@ -309,7 +314,8 @@ private Status(LuceneOperator operator) {
int sliceMin,
int sliceMax,
int current,
long rowsEmitted
long rowsEmitted,
Map<String, LuceneSliceQueue.PartitioningStrategy> partitioningStrategies
) {
this.processedSlices = processedSlices;
this.processedQueries = processedQueries;
@@ -322,6 +328,7 @@ private Status(LuceneOperator operator) {
this.sliceMax = sliceMax;
this.current = current;
this.rowsEmitted = rowsEmitted;
this.partitioningStrategies = partitioningStrategies;
}

Status(StreamInput in) throws IOException {
@@ -345,6 +352,9 @@ private Status(LuceneOperator operator) {
} else {
rowsEmitted = 0;
}
partitioningStrategies = in.getTransportVersion().onOrAfter(ESQL_REPORT_SHARD_PARTITIONING)
? in.readMap(LuceneSliceQueue.PartitioningStrategy::readFrom)
: Map.of();
}

@Override
@@ -366,6 +376,9 @@ public void writeTo(StreamOutput out) throws IOException {
if (out.getTransportVersion().onOrAfter(TransportVersions.ESQL_PROFILE_ROWS_PROCESSED)) {
out.writeVLong(rowsEmitted);
}
if (out.getTransportVersion().onOrAfter(ESQL_REPORT_SHARD_PARTITIONING)) {
out.writeMap(partitioningStrategies, StreamOutput::writeString, StreamOutput::writeWriteable);
}
}

@Override
@@ -417,6 +430,10 @@ public long rowsEmitted() {
return rowsEmitted;
}

public Map<String, LuceneSliceQueue.PartitioningStrategy> partitioningStrategies() {
return partitioningStrategies;
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@@ -434,6 +451,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.field("slice_max", sliceMax);
builder.field("current", current);
builder.field("rows_emitted", rowsEmitted);
builder.field("partitioning_strategies", new TreeMap<>(this.partitioningStrategies));
return builder.endObject();
}

@@ -452,12 +470,23 @@ public boolean equals(Object o) {
&& sliceMin == status.sliceMin
&& sliceMax == status.sliceMax
&& current == status.current
&& rowsEmitted == status.rowsEmitted;
&& rowsEmitted == status.rowsEmitted
&& partitioningStrategies.equals(status.partitioningStrategies);
}

@Override
public int hashCode() {
return Objects.hash(processedSlices, sliceIndex, totalSlices, pagesEmitted, sliceMin, sliceMax, current, rowsEmitted);
return Objects.hash(
processedSlices,
sliceIndex,
totalSlices,
pagesEmitted,
sliceMin,
sliceMax,
current,
rowsEmitted,
partitioningStrategies
);
}

@Override
@@ -470,17 +499,4 @@ public TransportVersion getMinimalSupportedVersion() {
return TransportVersions.V_8_11_X;
}
}

static Function<ShardContext, Weight> weightFunction(Function<ShardContext, Query> queryFunction, ScoreMode scoreMode) {
return ctx -> {
final var query = queryFunction.apply(ctx);
final var searcher = ctx.searcher();
try {
Query actualQuery = scoreMode.needsScores() ? query : new ConstantScoreQuery(query);
return searcher.createWeight(searcher.rewrite(actualQuery), scoreMode, 1);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
};
}
}