diff --git a/docs/changelog/125739.yaml b/docs/changelog/125739.yaml
new file mode 100644
index 0000000000000..cc5fa57b0f09b
--- /dev/null
+++ b/docs/changelog/125739.yaml
@@ -0,0 +1,5 @@
+pr: 125739
+summary: Heuristics to pick efficient partitioning
+area: ES|QL
+type: enhancement
+issues: []
diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java
index c730a0fe4cf07..5e0a25dcccba7 100644
--- a/server/src/main/java/org/elasticsearch/TransportVersions.java
+++ b/server/src/main/java/org/elasticsearch/TransportVersions.java
@@ -217,6 +217,7 @@ static TransportVersion def(int id) {
public static final TransportVersion SEMANTIC_TEXT_CHUNKING_CONFIG = def(9_047_00_0);
public static final TransportVersion REPO_ANALYSIS_COPY_BLOB = def(9_048_00_0);
public static final TransportVersion AMAZON_BEDROCK_TASK_SETTINGS = def(9_049_00_0);
+ public static final TransportVersion ESQL_REPORT_SHARD_PARTITIONING = def(9_050_00_0);
/*
* STOP! READ THIS FIRST! No, really,
diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/DataPartitioning.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/DataPartitioning.java
index 926b9e08d2e08..2529b060a9998 100644
--- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/DataPartitioning.java
+++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/DataPartitioning.java
@@ -7,11 +7,39 @@
package org.elasticsearch.compute.lucene;
-public enum DataPartitioning {
+import org.elasticsearch.compute.operator.Driver;
+/**
+ * How we partition the data across {@link Driver}s. Each request forks into
+ * {@code min(1.5 * cpus, partition_count)} threads on the data node. More partitions
+ * allow us to bring more threads to bear on CPU intensive data node side tasks.
+ */
+public enum DataPartitioning {
+ /**
+ * Automatically select the data partitioning based on the query and index.
+ * Usually that's {@link #SEGMENT}, but for small indices it's {@link #SHARD}.
+ * When the additional overhead from {@link #DOC} is fairly low then it'll
+ * pick {@link #DOC}.
+ */
+ AUTO,
+ /**
+ * Make one partition per shard. This is generally the slowest option, but it
+ * has the lowest CPU overhead.
+ */
SHARD,
-
+ /**
+ * Partition on segment boundaries; this doesn't allow forking to as many CPUs
+ * as {@link #DOC} but it has much lower overhead.
+ *
+ * It packs segments smaller than {@link LuceneSliceQueue#MAX_DOCS_PER_SLICE}
+ * docs together into a partition. Larger segments get their own partition.
+ * Each slice contains no more than {@link LuceneSliceQueue#MAX_SEGMENTS_PER_SLICE} segments.
+ */
SEGMENT,
-
+ /**
+ * Partition each shard into {@code task_concurrency} partitions, splitting
+ * larger segments into slices. This allows bringing the most CPUs to bear on
+ * the problem but adds extra overhead, especially in query preparation.
+ */
DOC,
}
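The Javadoc above pins down how partition count translates into parallelism: a request forks into min(1.5 * cpus, partition_count) drivers on the data node. As a rough sketch of that relationship (the method and argument names below are illustrative, not an API introduced by this change):

    // Illustrative only: driversFor(), availableCpus and partitionCount are
    // hypothetical names, not part of this change.
    static int driversFor(int availableCpus, int partitionCount) {
        // More partitions let more drivers run at once, but a single request is
        // still capped at roughly 1.5x the data node's CPU count.
        int cpuCap = (int) (1.5 * availableCpus);
        return Math.min(cpuCap, partitionCount);
    }

So SHARD partitioning of a one-shard index produces a single driver no matter how many CPUs are available, while SEGMENT and DOC create more partitions and can fill the CPU cap.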
diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneCountOperator.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneCountOperator.java
index 327303c45ad4b..5fadf98a9d823 100644
--- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneCountOperator.java
+++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneCountOperator.java
@@ -49,7 +49,16 @@ public Factory(
int taskConcurrency,
int limit
) {
- super(contexts, weightFunction(queryFunction, ScoreMode.COMPLETE_NO_SCORES), dataPartitioning, taskConcurrency, limit, false);
+ super(
+ contexts,
+ queryFunction,
+ dataPartitioning,
+ query -> LuceneSliceQueue.PartitioningStrategy.SHARD,
+ taskConcurrency,
+ limit,
+ false,
+ ScoreMode.COMPLETE_NO_SCORES
+ );
}
@Override
diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneMaxFactory.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneMaxFactory.java
index 3343750562cf5..5e91f2b80bcec 100644
--- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneMaxFactory.java
+++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneMaxFactory.java
@@ -23,8 +23,6 @@
import java.util.List;
import java.util.function.Function;
-import static org.elasticsearch.compute.lucene.LuceneOperator.weightFunction;
-
/**
* Factory that generates an operator that finds the max value of a field using the {@link LuceneMinMaxOperator}.
*/
@@ -123,7 +121,16 @@ public LuceneMaxFactory(
NumberType numberType,
int limit
) {
- super(contexts, weightFunction(queryFunction, ScoreMode.COMPLETE_NO_SCORES), dataPartitioning, taskConcurrency, limit, false);
+ super(
+ contexts,
+ queryFunction,
+ dataPartitioning,
+ query -> LuceneSliceQueue.PartitioningStrategy.SHARD,
+ taskConcurrency,
+ limit,
+ false,
+ ScoreMode.COMPLETE_NO_SCORES
+ );
this.fieldName = fieldName;
this.numberType = numberType;
}
diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneMinFactory.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneMinFactory.java
index 5f0849e882813..fc457ae196186 100644
--- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneMinFactory.java
+++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneMinFactory.java
@@ -23,8 +23,6 @@
import java.util.List;
import java.util.function.Function;
-import static org.elasticsearch.compute.lucene.LuceneOperator.weightFunction;
-
/**
* Factory that generates an operator that finds the min value of a field using the {@link LuceneMinMaxOperator}.
*/
@@ -123,7 +121,16 @@ public LuceneMinFactory(
NumberType numberType,
int limit
) {
- super(contexts, weightFunction(queryFunction, ScoreMode.COMPLETE_NO_SCORES), dataPartitioning, taskConcurrency, limit, false);
+ super(
+ contexts,
+ queryFunction,
+ dataPartitioning,
+ query -> LuceneSliceQueue.PartitioningStrategy.SHARD,
+ taskConcurrency,
+ limit,
+ false,
+ ScoreMode.COMPLETE_NO_SCORES
+ );
this.fieldName = fieldName;
this.numberType = numberType;
}
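The count, max, and min factories above all pass the same AUTO resolution function: whatever the query rewrites to, AUTO falls back to the single-slice-per-shard strategy, presumably because these operators can often answer from shard and segment statistics rather than visiting documents, so extra parallelism would not pay for its setup cost. Pulled out on its own, the callback is just:

    // The AUTO callback supplied by the count/min/max factories above: ignore the
    // rewritten query and resolve AUTO to the lowest-overhead strategy.
    Function<Query, LuceneSliceQueue.PartitioningStrategy> metadataOperatorAutoStrategy =
        query -> LuceneSliceQueue.PartitioningStrategy.SHARD;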
diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneOperator.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneOperator.java
index 2279603432d2f..9bd5af16b094f 100644
--- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneOperator.java
+++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneOperator.java
@@ -9,7 +9,6 @@
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BulkScorer;
-import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Query;
@@ -37,12 +36,16 @@
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
+import java.util.Map;
import java.util.Objects;
import java.util.Set;
+import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.stream.Collectors;
+import static org.elasticsearch.TransportVersions.ESQL_REPORT_SHARD_PARTITIONING;
+
public abstract class LuceneOperator extends SourceOperator {
private static final Logger logger = LogManager.getLogger(LuceneOperator.class);
@@ -93,15 +96,17 @@ public abstract static class Factory implements SourceOperator.SourceOperatorFac
*/
protected Factory(
List<? extends ShardContext> contexts,
- Function<ShardContext, Weight> weightFunction,
+ Function<ShardContext, Query> queryFunction,
DataPartitioning dataPartitioning,
+ Function<Query, LuceneSliceQueue.PartitioningStrategy> autoStrategy,
int taskConcurrency,
int limit,
- boolean needsScore
+ boolean needsScore,
+ ScoreMode scoreMode
) {
this.limit = limit;
this.dataPartitioning = dataPartitioning;
- this.sliceQueue = LuceneSliceQueue.create(contexts, weightFunction, dataPartitioning, taskConcurrency);
+ this.sliceQueue = LuceneSliceQueue.create(contexts, queryFunction, dataPartitioning, autoStrategy, taskConcurrency, scoreMode);
this.taskConcurrency = Math.min(sliceQueue.totalSlices(), taskConcurrency);
this.needsScore = needsScore;
}
@@ -269,6 +274,7 @@ public static class Status implements Operator.Status {
private final int sliceMax;
private final int current;
private final long rowsEmitted;
+ private final Map<String, LuceneSliceQueue.PartitioningStrategy> partitioningStrategies;
private Status(LuceneOperator operator) {
processedSlices = operator.processedSlices;
@@ -294,6 +300,7 @@ private Status(LuceneOperator operator) {
}
pagesEmitted = operator.pagesEmitted;
rowsEmitted = operator.rowsEmitted;
+ partitioningStrategies = operator.sliceQueue.partitioningStrategies();
}
Status(
@@ -307,7 +314,8 @@ private Status(LuceneOperator operator) {
int sliceMin,
int sliceMax,
int current,
- long rowsEmitted
+ long rowsEmitted,
+ Map<String, LuceneSliceQueue.PartitioningStrategy> partitioningStrategies
) {
this.processedSlices = processedSlices;
this.processedQueries = processedQueries;
@@ -320,6 +328,7 @@ private Status(LuceneOperator operator) {
this.sliceMax = sliceMax;
this.current = current;
this.rowsEmitted = rowsEmitted;
+ this.partitioningStrategies = partitioningStrategies;
}
Status(StreamInput in) throws IOException {
@@ -343,6 +352,9 @@ private Status(LuceneOperator operator) {
} else {
rowsEmitted = 0;
}
+ partitioningStrategies = in.getTransportVersion().onOrAfter(ESQL_REPORT_SHARD_PARTITIONING)
+ ? in.readMap(LuceneSliceQueue.PartitioningStrategy::readFrom)
+ : Map.of();
}
@Override
@@ -364,6 +376,9 @@ public void writeTo(StreamOutput out) throws IOException {
if (out.getTransportVersion().onOrAfter(TransportVersions.ESQL_PROFILE_ROWS_PROCESSED)) {
out.writeVLong(rowsEmitted);
}
+ if (out.getTransportVersion().onOrAfter(ESQL_REPORT_SHARD_PARTITIONING)) {
+ out.writeMap(partitioningStrategies, StreamOutput::writeString, StreamOutput::writeWriteable);
+ }
}
@Override
@@ -415,6 +430,10 @@ public long rowsEmitted() {
return rowsEmitted;
}
+ public Map<String, LuceneSliceQueue.PartitioningStrategy> partitioningStrategies() {
+ return partitioningStrategies;
+ }
+
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@@ -432,6 +451,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.field("slice_max", sliceMax);
builder.field("current", current);
builder.field("rows_emitted", rowsEmitted);
+ builder.field("partitioning_strategies", new TreeMap<>(this.partitioningStrategies));
return builder.endObject();
}
@@ -450,12 +470,23 @@ public boolean equals(Object o) {
&& sliceMin == status.sliceMin
&& sliceMax == status.sliceMax
&& current == status.current
- && rowsEmitted == status.rowsEmitted;
+ && rowsEmitted == status.rowsEmitted
+ && partitioningStrategies.equals(status.partitioningStrategies);
}
@Override
public int hashCode() {
- return Objects.hash(processedSlices, sliceIndex, totalSlices, pagesEmitted, sliceMin, sliceMax, current, rowsEmitted);
+ return Objects.hash(
+ processedSlices,
+ sliceIndex,
+ totalSlices,
+ pagesEmitted,
+ sliceMin,
+ sliceMax,
+ current,
+ rowsEmitted,
+ partitioningStrategies
+ );
}
@Override
@@ -468,17 +499,4 @@ public TransportVersion getMinimalSupportedVersion() {
return TransportVersions.V_8_11_X;
}
}
-
- static Function<ShardContext, Weight> weightFunction(Function<ShardContext, Query> queryFunction, ScoreMode scoreMode) {
- return ctx -> {
- final var query = queryFunction.apply(ctx);
- final var searcher = ctx.searcher();
- try {
- Query actualQuery = scoreMode.needsScores() ? query : new ConstantScoreQuery(query);
- return searcher.createWeight(searcher.rewrite(actualQuery), scoreMode, 1);
- } catch (IOException e) {
- throw new UncheckedIOException(e);
- }
- };
- }
}
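The Status changes above only put the new partitioning map on the wire when the stream's transport version is at least ESQL_REPORT_SHARD_PARTITIONING, and default it to an empty map when reading from an older node. A minimal sketch of that mixed-version gating pattern, using a hypothetical string-valued field rather than the real one:

    import org.elasticsearch.common.io.stream.StreamInput;
    import org.elasticsearch.common.io.stream.StreamOutput;

    import java.io.IOException;
    import java.util.Map;

    import static org.elasticsearch.TransportVersions.ESQL_REPORT_SHARD_PARTITIONING;

    // Hypothetical field, used only to illustrate the gating pattern above.
    static void writeExtra(StreamOutput out, Map<String, String> extra) throws IOException {
        if (out.getTransportVersion().onOrAfter(ESQL_REPORT_SHARD_PARTITIONING)) {
            out.writeMap(extra, StreamOutput::writeString, StreamOutput::writeString);
        }
        // Older receivers simply never see the field.
    }

    static Map<String, String> readExtra(StreamInput in) throws IOException {
        return in.getTransportVersion().onOrAfter(ESQL_REPORT_SHARD_PARTITIONING)
            ? in.readMap(StreamInput::readString)
            : Map.of(); // sender was too old to know about the field
    }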
diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneSliceQueue.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneSliceQueue.java
index 0407e0f726044..4a8847e2870aa 100644
--- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneSliceQueue.java
+++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneSliceQueue.java
@@ -7,17 +7,25 @@
package org.elasticsearch.compute.lucene;
-import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.core.Nullable;
+import java.io.IOException;
+import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
-import java.util.Collections;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.function.Function;
@@ -26,15 +34,17 @@
* Shared Lucene slices between Lucene operators.
*/
public final class LuceneSliceQueue {
- private static final int MAX_DOCS_PER_SLICE = 250_000; // copied from IndexSearcher
- private static final int MAX_SEGMENTS_PER_SLICE = 5; // copied from IndexSearcher
+ public static final int MAX_DOCS_PER_SLICE = 250_000; // copied from IndexSearcher
+ public static final int MAX_SEGMENTS_PER_SLICE = 5; // copied from IndexSearcher
private final int totalSlices;
private final Queue<LuceneSlice> slices;
+ private final Map<String, PartitioningStrategy> partitioningStrategies;
- private LuceneSliceQueue(List<LuceneSlice> slices) {
+ private LuceneSliceQueue(List<LuceneSlice> slices, Map<String, PartitioningStrategy> partitioningStrategies) {
this.totalSlices = slices.size();
this.slices = new ConcurrentLinkedQueue<>(slices);
+ this.partitioningStrategies = partitioningStrategies;
}
@Nullable
@@ -46,82 +56,196 @@ public int totalSlices() {
return totalSlices;
}
+ /**
+ * Strategy used to partition each shard in this queue.
+ */
+ public Map<String, PartitioningStrategy> partitioningStrategies() {
+ return partitioningStrategies;
+ }
+
public Collection<String> remainingShardsIdentifiers() {
return slices.stream().map(slice -> slice.shardContext().shardIdentifier()).toList();
}
public static LuceneSliceQueue create(
List<? extends ShardContext> contexts,
- Function<ShardContext, Weight> weightFunction,
+ Function<ShardContext, Query> queryFunction,
DataPartitioning dataPartitioning,
- int taskConcurrency
+ Function<Query, PartitioningStrategy> autoStrategy,
+ int taskConcurrency,
+ ScoreMode scoreMode
) {
- final List<LuceneSlice> slices = new ArrayList<>();
+ List<LuceneSlice> slices = new ArrayList<>();
+ Map<String, PartitioningStrategy> partitioningStrategies = new HashMap<>(contexts.size());
for (ShardContext ctx : contexts) {
- final List<LeafReaderContext> leafContexts = ctx.searcher().getLeafContexts();
- List<List<PartialLeafReaderContext>> groups = switch (dataPartitioning) {
- case SHARD -> Collections.singletonList(leafContexts.stream().map(PartialLeafReaderContext::new).toList());
- case SEGMENT -> segmentSlices(leafContexts);
- case DOC -> docSlices(ctx.searcher().getIndexReader(), taskConcurrency);
- };
- final Weight weight = weightFunction.apply(ctx);
+ Query query = queryFunction.apply(ctx);
+ query = scoreMode.needsScores() ? query : new ConstantScoreQuery(query);
+ /*
+ * Rewrite the query on the local index so things like fully
+ * overlapping range queries become match all. It's important
+ * to do this before picking the partitioning strategy so we
+ * can pick more aggressive strategies when the query rewrites
+ * into MatchAll.
+ */
+ try {
+ query = ctx.searcher().rewrite(query);
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+ PartitioningStrategy partitioning = PartitioningStrategy.pick(dataPartitioning, autoStrategy, ctx, query);
+ partitioningStrategies.put(ctx.shardIdentifier(), partitioning);
+ List<List<PartialLeafReaderContext>> groups = partitioning.groups(ctx.searcher(), taskConcurrency);
+ Weight weight = weight(ctx, query, scoreMode);
for (List<PartialLeafReaderContext> group : groups) {
if (group.isEmpty() == false) {
slices.add(new LuceneSlice(ctx, group, weight));
}
}
}
- return new LuceneSliceQueue(slices);
+ return new LuceneSliceQueue(slices, partitioningStrategies);
}
- static List<List<PartialLeafReaderContext>> docSlices(IndexReader indexReader, int numSlices) {
- final int totalDocCount = indexReader.maxDoc();
- final int normalMaxDocsPerSlice = totalDocCount / numSlices;
- final int extraDocsInFirstSlice = totalDocCount % numSlices;
- final List<List<PartialLeafReaderContext>> slices = new ArrayList<>();
- int docsAllocatedInCurrentSlice = 0;
- List<PartialLeafReaderContext> currentSlice = null;
- int maxDocsPerSlice = normalMaxDocsPerSlice + extraDocsInFirstSlice;
- for (LeafReaderContext ctx : indexReader.leaves()) {
- final int numDocsInLeaf = ctx.reader().maxDoc();
- int minDoc = 0;
- while (minDoc < numDocsInLeaf) {
- int numDocsToUse = Math.min(maxDocsPerSlice - docsAllocatedInCurrentSlice, numDocsInLeaf - minDoc);
- if (numDocsToUse <= 0) {
- break;
- }
- if (currentSlice == null) {
- currentSlice = new ArrayList<>();
+ /**
+ * Strategy used to partition each shard into slices. See {@link DataPartitioning}
+ * for descriptions on how each value works.
+ */
+ public enum PartitioningStrategy implements Writeable {
+ /**
+ * See {@link DataPartitioning#SHARD}.
+ */
+ SHARD(0) {
+ @Override
List<List<PartialLeafReaderContext>> groups(IndexSearcher searcher, int requestedNumSlices) {
+ return List.of(searcher.getLeafContexts().stream().map(PartialLeafReaderContext::new).toList());
+ }
+ },
+ /**
+ * See {@link DataPartitioning#SEGMENT}.
+ */
+ SEGMENT(1) {
+ @Override
List<List<PartialLeafReaderContext>> groups(IndexSearcher searcher, int requestedNumSlices) {
+ IndexSearcher.LeafSlice[] gs = IndexSearcher.slices(
+ searcher.getLeafContexts(),
+ MAX_DOCS_PER_SLICE,
+ MAX_SEGMENTS_PER_SLICE,
+ false
+ );
+ return Arrays.stream(gs).map(g -> Arrays.stream(g.partitions).map(PartialLeafReaderContext::new).toList()).toList();
+ }
+ },
+ /**
+ * See {@link DataPartitioning#DOC}.
+ */
+ DOC(2) {
+ @Override
List<List<PartialLeafReaderContext>> groups(IndexSearcher searcher, int requestedNumSlices) {
+ final int totalDocCount = searcher.getIndexReader().maxDoc();
+ final int normalMaxDocsPerSlice = totalDocCount / requestedNumSlices;
+ final int extraDocsInFirstSlice = totalDocCount % requestedNumSlices;
+ final List<List<PartialLeafReaderContext>> slices = new ArrayList<>();
+ int docsAllocatedInCurrentSlice = 0;
+ List<PartialLeafReaderContext> currentSlice = null;
+ int maxDocsPerSlice = normalMaxDocsPerSlice + extraDocsInFirstSlice;
+ for (LeafReaderContext ctx : searcher.getLeafContexts()) {
+ final int numDocsInLeaf = ctx.reader().maxDoc();
+ int minDoc = 0;
+ while (minDoc < numDocsInLeaf) {
+ int numDocsToUse = Math.min(maxDocsPerSlice - docsAllocatedInCurrentSlice, numDocsInLeaf - minDoc);
+ if (numDocsToUse <= 0) {
+ break;
+ }
+ if (currentSlice == null) {
+ currentSlice = new ArrayList<>();
+ }
+ currentSlice.add(new PartialLeafReaderContext(ctx, minDoc, minDoc + numDocsToUse));
+ minDoc += numDocsToUse;
+ docsAllocatedInCurrentSlice += numDocsToUse;
+ if (docsAllocatedInCurrentSlice == maxDocsPerSlice) {
+ slices.add(currentSlice);
+ // once the first slice with the extra docs is added, no need for extra docs
+ maxDocsPerSlice = normalMaxDocsPerSlice;
+ currentSlice = null;
+ docsAllocatedInCurrentSlice = 0;
+ }
+ }
}
- currentSlice.add(new PartialLeafReaderContext(ctx, minDoc, minDoc + numDocsToUse));
- minDoc += numDocsToUse;
- docsAllocatedInCurrentSlice += numDocsToUse;
- if (docsAllocatedInCurrentSlice == maxDocsPerSlice) {
+ if (currentSlice != null) {
slices.add(currentSlice);
- maxDocsPerSlice = normalMaxDocsPerSlice; // once the first slice with the extra docs is added, no need for extra docs
- currentSlice = null;
- docsAllocatedInCurrentSlice = 0;
}
+ if (requestedNumSlices < totalDocCount && slices.size() != requestedNumSlices) {
+ throw new IllegalStateException("wrong number of slices, expected " + requestedNumSlices + " but got " + slices.size());
+ }
+ if (slices.stream()
+ .flatMapToInt(
+ l -> l.stream()
+ .mapToInt(partialLeafReaderContext -> partialLeafReaderContext.maxDoc() - partialLeafReaderContext.minDoc())
+ )
+ .sum() != totalDocCount) {
+ throw new IllegalStateException("wrong doc count");
+ }
+ return slices;
}
+ };
+
+ private final byte id;
+
+ PartitioningStrategy(int id) {
+ this.id = (byte) id;
}
- if (currentSlice != null) {
- slices.add(currentSlice);
+
+ public static PartitioningStrategy readFrom(StreamInput in) throws IOException {
+ int id = in.readByte();
+ return switch (id) {
+ case 0 -> SHARD;
+ case 1 -> SEGMENT;
+ case 2 -> DOC;
+ default -> throw new IllegalArgumentException("invalid PartitioningStrategyId [" + id + "]");
+ };
}
- if (numSlices < totalDocCount && slices.size() != numSlices) {
- throw new IllegalStateException("wrong number of slices, expected " + numSlices + " but got " + slices.size());
+
+ @Override
+ public void writeTo(StreamOutput out) throws IOException {
+ out.writeByte(id);
}
- if (slices.stream()
- .flatMapToInt(
- l -> l.stream().mapToInt(partialLeafReaderContext -> partialLeafReaderContext.maxDoc() - partialLeafReaderContext.minDoc())
- )
- .sum() != totalDocCount) {
- throw new IllegalStateException("wrong doc count");
+
+ abstract List<List<PartialLeafReaderContext>> groups(IndexSearcher searcher, int requestedNumSlices);
+
+ private static PartitioningStrategy pick(
+ DataPartitioning dataPartitioning,
+ Function<Query, PartitioningStrategy> autoStrategy,
+ ShardContext ctx,
+ Query query
+ ) {
+ return switch (dataPartitioning) {
+ case SHARD -> PartitioningStrategy.SHARD;
+ case SEGMENT -> PartitioningStrategy.SEGMENT;
+ case DOC -> PartitioningStrategy.DOC;
+ case AUTO -> forAuto(autoStrategy, ctx, query);
+ };
+ }
+
+ /**
+ * {@link DataPartitioning#AUTO} resolves to {@link #SHARD} for indices
+ * with fewer than this many documents.
+ */
+ private static final int SMALL_INDEX_BOUNDARY = MAX_DOCS_PER_SLICE;
+
+ private static PartitioningStrategy forAuto(Function<Query, PartitioningStrategy> autoStrategy, ShardContext ctx, Query query) {
+ if (ctx.searcher().getIndexReader().maxDoc() < SMALL_INDEX_BOUNDARY) {
+ return PartitioningStrategy.SHARD;
+ }
+ return autoStrategy.apply(query);
}
- return slices;
}
- static List<List<PartialLeafReaderContext>> segmentSlices(List<LeafReaderContext> leafContexts) {
- IndexSearcher.LeafSlice[] gs = IndexSearcher.slices(leafContexts, MAX_DOCS_PER_SLICE, MAX_SEGMENTS_PER_SLICE, false);
- return Arrays.stream(gs).map(g -> Arrays.stream(g.partitions).map(PartialLeafReaderContext::new).toList()).toList();
+ static Weight weight(ShardContext ctx, Query query, ScoreMode scoreMode) {
+ var searcher = ctx.searcher();
+ try {
+ Query actualQuery = scoreMode.needsScores() ? query : new ConstantScoreQuery(query);
+ return searcher.createWeight(actualQuery, scoreMode, 1);
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
}
}
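create() above rewrites the query against each shard's searcher before choosing a strategy, so, for example, a range filter that covers every document in the shard can rewrite to a match-all query and the AUTO callback gets to see that. The concrete AUTO callback for LuceneSourceOperator (Factory::highSpeedAutoStrategy) appears further down and is partly cut off in this excerpt; the sketch below is a simplified stand-in that follows its Javadoc: match-nothing stays on SHARD, match-all goes to DOC, anything else uses SEGMENT. Judging by the imports added to LuceneSourceOperator, the real heuristic also unwraps ConstantScoreQuery/BoostQuery and inspects BooleanQuery clauses, which this sketch skips. Note that forAuto above already short-circuits shards smaller than MAX_DOCS_PER_SLICE to SHARD before any callback runs.

    import org.apache.lucene.search.MatchAllDocsQuery;
    import org.apache.lucene.search.MatchNoDocsQuery;
    import org.apache.lucene.search.Query;

    // Simplified stand-in for the AUTO heuristic, not this PR's exact implementation.
    static LuceneSliceQueue.PartitioningStrategy pickForAuto(Query rewritten) {
        if (rewritten instanceof MatchNoDocsQuery) {
            return LuceneSliceQueue.PartitioningStrategy.SHARD;   // nothing will match; keep overhead minimal
        }
        if (rewritten instanceof MatchAllDocsQuery) {
            return LuceneSliceQueue.PartitioningStrategy.DOC;     // match-all is cheap to slice per doc; use every CPU
        }
        return LuceneSliceQueue.PartitioningStrategy.SEGMENT;     // default: low overhead, decent parallelism
    }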
diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneSourceOperator.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneSourceOperator.java
index 63dbf2926275e..51b842d3f0ddc 100644
--- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneSourceOperator.java
+++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/lucene/LuceneSourceOperator.java
@@ -7,8 +7,14 @@
package org.elasticsearch.compute.lucene;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.CollectionTerminatedException;
+import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.LeafCollector;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorable;
import org.elasticsearch.compute.data.BlockFactory;
@@ -17,22 +23,30 @@
import org.elasticsearch.compute.data.DoubleVector;
import org.elasticsearch.compute.data.IntVector;
import org.elasticsearch.compute.data.Page;
+import org.elasticsearch.compute.lucene.LuceneSliceQueue.PartitioningStrategy;
import org.elasticsearch.compute.operator.DriverContext;
import org.elasticsearch.compute.operator.Limiter;
import org.elasticsearch.compute.operator.SourceOperator;
import org.elasticsearch.core.Releasables;
+import org.elasticsearch.logging.LogManager;
+import org.elasticsearch.logging.Logger;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
import static org.apache.lucene.search.ScoreMode.COMPLETE;
import static org.apache.lucene.search.ScoreMode.COMPLETE_NO_SCORES;
+import static org.elasticsearch.compute.lucene.LuceneSliceQueue.PartitioningStrategy.DOC;
+import static org.elasticsearch.compute.lucene.LuceneSliceQueue.PartitioningStrategy.SEGMENT;
+import static org.elasticsearch.compute.lucene.LuceneSliceQueue.PartitioningStrategy.SHARD;
/**
* Source operator that incrementally runs Lucene searches
*/
public class LuceneSourceOperator extends LuceneOperator {
+ private static final Logger log = LogManager.getLogger(LuceneSourceOperator.class);
private int currentPagePos = 0;
private int remainingDocs;
@@ -59,11 +73,13 @@ public Factory(
) {
super(
contexts,
- weightFunction(queryFunction, needsScore ? COMPLETE : COMPLETE_NO_SCORES),
+ queryFunction,
dataPartitioning,
+ autoStrategy(limit),
taskConcurrency,
limit,
- needsScore
+ needsScore,
+ needsScore ? COMPLETE : COMPLETE_NO_SCORES
);
this.maxPageSize = maxPageSize;
// TODO: use a single limiter for multiple stage execution
@@ -91,6 +107,110 @@ public String describe() {
+ needsScore
+ "]";
}
+
+ /**
+ * Pick a strategy for the {@link DataPartitioning#AUTO} partitioning.
+ */
+ public static Function<Query, PartitioningStrategy> autoStrategy(int limit) {
+ return limit == NO_LIMIT ? Factory::highSpeedAutoStrategy : Factory::lowOverheadAutoStrategy;
+ }
+
+ /**
+ * Use the {@link PartitioningStrategy#SHARD} strategy because
+ * it has the lowest overhead. Used when there is a {@code limit} on the operator
+ * because that's for cases like {@code FROM foo | LIMIT 10} or
+ * {@code FROM foo | WHERE a == 1 | LIMIT 10} when the {@code WHERE} can be pushed
+ * to Lucene. In those cases we're better off with the lowest overhead we can
+ * manage - and that's {@link PartitioningStrategy#SHARD}.
+ */
+ private static PartitioningStrategy lowOverheadAutoStrategy(Query query) {
+ return SHARD;
+ }
+
+ /**
+ * Select the {@link PartitioningStrategy} based on the {@link Query}.
+ *
+ *
+ * If the {@linkplain Query} matches no documents then this will
+ * use the {@link PartitioningStrategy#SHARD} strategy so we minimize the overhead
+ * of finding nothing.
+ *
+ *
+ * If the {@linkplain Query} matches all documents then this will
+ * use the {@link PartitioningStrategy#DOC} strategy because the overhead of using
+ * that strategy for {@link MatchAllDocsQuery} is very low, and we need as many CPUs
+ * as we can get to process all the documents.
+ *
+ *
+ * Otherwise use the {@link PartitioningStrategy#SEGMENT} strategy because its
+ * overhead is generally low.
+ *