iteratively get sample probability

jan-elastic · jan-elastic · commit 319d98d71276 · 2025-07-29T16:57:28.000+02:00
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlQueryRequest.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlQueryRequest.java
@@ -48,6 +48,7 @@ public class EsqlQueryRequest extends org.elasticsearch.xpack.core.esql.action.E
     private boolean includeCCSMetadata;
     private Locale locale;
     private QueryBuilder filter;
+    // TODO: discuss how to wire the approximation functionality in the API
     private boolean approximate;
     private QueryPragmas pragmas = new QueryPragmas(Settings.EMPTY);
     private QueryParams params = new QueryParams();
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/approximate/Approximate.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/approximate/Approximate.java
@@ -7,6 +7,8 @@
 
 package org.elasticsearch.xpack.esql.approximate;
 
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.compute.data.LongBlock;
 import org.elasticsearch.xpack.esql.core.InvalidArgumentException;
@@ -35,63 +37,81 @@
 import java.util.Locale;
 import java.util.Set;
 
+/**
+ * This class computes approximate and fast results for certain classes of
+ * ES|QL queries.
+ * <p>
+ * A query is suitable for approximation if it contains at least one
+ * {@code STATS} command, and all commands between the source and the leftmost
+ * {@code STATS} command can be swapped with {@code SAMPLE}. A command can be
+ * swapped with {@code SAMPLE} if it is either mapping one row to one row (e.g.
+ * {@code EVAL} or {@code GROK}), or if it is filtering rows (e.g. {@code FILTER}
+ * or {@code SAMPLE}). This is verified by {@link Approximate#verifyPlan}.
+ * <p>
+ * If this is the case, the {@code STATS} can be replaced by {@code SAMPLE} and
+ * a {@code STATS} with sample correction terms, and the {@code SAMPLE} can be
+ * moved to the source and executed inside Lucene. This new logical plan is
+ * generated by {@link Approximate#approximatePlan}.
+ * <p>
+ * To compute the appropriate sample probability, first a target number of rows
+ * is set. For now this is a fixed number ({@link Approximate#SAMPLE_ROW_COUNT}).
+ * <p>
+ * Next, the total number of rows in the source index is counted via the plan
+ * {@link Approximate#sourceCountPlan}. This plan should execute fast. When
+ * there are no filter commands, the sample probability can be directly
+ * computed as a ratio of the target number of rows and this total number.
+ * <p>
+ * In the presence of filter commands, another step is needed. The initial
+ * sample probability is set to the ratio above and the number of rows is
+ * sampled with the plan {@link Approximate#countPlan}. As long as the sampled
+ * number of rows is smaller than intended, the probability is scaled up until
+ * a good probability is reached. This final probability is then used for
+ * approximating the original plan.
+ */
 public class Approximate {
 
     public interface LogicalPlanRunner {
         void run(LogicalPlan plan, ActionListener<Result> listener);
     }
 
-    private static final Set<Class<? extends LogicalPlan>> SWAPPABLE_WITH_SAMPLE = Set.of(
+    private static final Set<Class<? extends LogicalPlan>> ONE_TO_ONE_COMMANDS = Set.of(
         Dissect.class,
         Drop.class,
         Eval.class,
-        Filter.class,
         Grok.class,
         Keep.class,
         OrderBy.class,
-        Rename.class,
-        Sample.class
+        Rename.class
     );
 
-    // TODO: not a good value
+    private static final Set<Class<? extends LogicalPlan>> FILTER_COMMANDS = Set.of(Filter.class, Sample.class);
+
+    // TODO: find a good default value, or alternative ways of setting it
     private static final int SAMPLE_ROW_COUNT = 1000;
 
+    private static final Logger logger = LogManager.getLogger(Approximate.class);
+
     private final LogicalPlan logicalPlan;
+    private final boolean hasFilters;
 
     public Approximate(LogicalPlan logicalPlan) {
         this.logicalPlan = logicalPlan;
-        verifyPlan();
+        this.hasFilters = verifyPlan();
     }
 
     /**
-     * Computes approximate results for the given logical plan.
-     *
-     * This works by first executing a plan that counts the number of rows
-     * getting to the aggregation. That count is used to compute a sample
-     * probability, which is then used to sample approximately 1000 rows
-     * to aggregate over and approximate the aggregation.
+     * Computes approximate results for the logical plan.
      */
     public void approximate(LogicalPlanRunner runner, ActionListener<Result> listener) {
-        runner.run(
-            countPlan(),
-            listener.delegateFailureAndWrap(
-                (countListener, countResult) -> runner.run(approximatePlan(sampleProbability(countResult)), listener)
-            )
-        );
+        runner.run(sourceCountPlan(), sourceCountListener(runner, listener));
     }
 
     /**
      * Verifies that a plan is suitable for approximation.
      *
-     * To be so, the plan must contain at least one STATS function, and all
-     * functions between the source and the leftmost STATS function must be
-     * swappable with SAMPLE.
-     *
-     * In that case, the STATS can be replaced by SAMPLE, STATS with sample
-     * correction terms, and the SAMPLE can be moved to the source and
-     * executed inside Lucene.
+     * @return whether the plan contains filter commands
      */
-    private void verifyPlan() {
+    private boolean verifyPlan() {
         if (logicalPlan.preOptimized() == false) {
             throw new IllegalStateException("Expected pre-optimized plan");
         }
@@ -101,39 +121,88 @@ private void verifyPlan() {
         }
 
         Holder<Boolean> encounteredStats = new Holder<>(false);
+        Holder<Boolean> hasFilters = new Holder<>(false);
         logicalPlan.transformUp(plan -> {
+            // TODO: check/fix for JOIN / FORK / INLINESTATS / ...
             if (plan instanceof LeafPlan) {
                 encounteredStats.set(false);
             } else if (encounteredStats.get() == false) {
                 if (plan instanceof Aggregate) {
                     encounteredStats.set(true);
-                } else if (SWAPPABLE_WITH_SAMPLE.contains(plan.getClass()) == false) {
+                } else if (ONE_TO_ONE_COMMANDS.contains(plan.getClass()) == false && FILTER_COMMANDS.contains(plan.getClass()) == false) {
                     throw new InvalidArgumentException(
                         "query with [" + plan.nodeName().toUpperCase(Locale.ROOT) + "] before [STATS] function cannot be approximated"
                     );
+                } else if (FILTER_COMMANDS.contains(plan.getClass())) {
+                    hasFilters.set(true);
                 }
             }
             return plan;
         });
+
+        return hasFilters.get();
+    }
+
+    /**
+     * Plan that counts the number of rows in the source index.
+     * This is the ES|QL query {@code FROM index | STATS COUNT(*)}.
+     */
+    private LogicalPlan sourceCountPlan() {
+        LogicalPlan sourceCountPlan = logicalPlan.transformUp(plan -> {
+            // TODO: check/fix for JOIN / FORK / INLINESTATS / ...
+            if (plan instanceof LeafPlan) {
+                plan = new Aggregate(
+                    Source.EMPTY,
+                    plan,
+                    List.of(),
+                    List.of(new Alias(Source.EMPTY, "approximate-count", new Count(Source.EMPTY, Literal.keyword(Source.EMPTY, "*"))))
+                );
+            } else {
+                plan = plan.children().getFirst();
+            }
+            return plan;
+        });
+
+        sourceCountPlan.setPreOptimized();
+        return sourceCountPlan;
+    }
+
+    /**
+     * Receives the total number of rows, and runs either the
+     * {@link Approximate#approximatePlan} or {@link Approximate#countPlan}
+     * depending on whether filter commands are present.
+     */
+    private ActionListener<Result> sourceCountListener(LogicalPlanRunner runner, ActionListener<Result> listener) {
+        return listener.delegateFailureAndWrap((countListener, countResult) -> {
+            logger.debug("sourceCountPlan result: {} rows", rowCount(countResult));
+            double sampleProbability = sampleProbability(countResult);
+            if (hasFilters) {
+                runner.run(countPlan(sampleProbability), countListener(runner, sampleProbability, listener));
+            } else {
+                runner.run(approximatePlan(sampleProbability), listener);
+            }
+        });
     }
 
     /**
-     * Returns a plan that counts the number of rows of the original plan that
-     * would reach the leftmost STATS function. So it's the original plan cut
-     * off at the leftmost STATS function, followed by "| STATS COUNT(*)".
-     * This value can be used to pick a good sample probability.
+     * Plan that counts the number of rows reaching the leftmost STATS function.
+     * This number is approximated to speed up the query execution.
+     * This is the ES|QL query {@code FROM index | (...) | SAMPLE p | STATS COUNT(*)}.
      */
-    private LogicalPlan countPlan() {
+    private LogicalPlan countPlan(double sampleProbability) {
         Holder<Boolean> encounteredStats = new Holder<>(false);
         LogicalPlan countPlan = logicalPlan.transformUp(plan -> {
+            // TODO: check/fix for JOIN / FORK / INLINESTATS / ...
             if (plan instanceof LeafPlan) {
                 encounteredStats.set(false);
             } else if (encounteredStats.get() == false) {
                 if (plan instanceof Aggregate aggregate) {
                     encounteredStats.set(true);
+                    Expression sampleProbabilityExpr = new Literal(Source.EMPTY, sampleProbability, DataType.DOUBLE);
+                    Sample sample = new Sample(Source.EMPTY, sampleProbabilityExpr, aggregate.child());
                     plan = new Aggregate(
                         Source.EMPTY,
-                        aggregate.child(),
+                        sample,
                         List.of(),
                         List.of(new Alias(Source.EMPTY, "approximate-count", new Count(Source.EMPTY, Literal.keyword(Source.EMPTY, "*"))))
                     );
@@ -148,22 +217,54 @@ private LogicalPlan countPlan() {
         return countPlan;
     }
 
+    /**
+     * Receives the sampled number of rows reaching the leftmost STATS function.
+     * Runs either the {@link Approximate#approximatePlan} or a next iteration
+     * {@link Approximate#countPlan} depending on whether the current count is
+     * sufficient.
+     */
+    private ActionListener<Result> countListener(LogicalPlanRunner runner, double probability, ActionListener<Result> listener) {
+        return listener.delegateFailureAndWrap((countListener, countResult) -> {
+            long rowCount = rowCount(countResult);
+            logger.debug("countPlan result (p={}):{} rows", probability, rowCount);
+            double newProbability = probability * SAMPLE_ROW_COUNT / Math.max(1, rowCount);
+            if (rowCount <= SAMPLE_ROW_COUNT / 2 && newProbability < 1.0) {
+                runner.run(countPlan(newProbability), countListener(runner, newProbability, listener));
+            } else {
+                runner.run(approximatePlan(newProbability), listener);
+            }
+        });
+    }
+
     /**
      * Returns a sample probability based on the total number of rows.
      */
     private double sampleProbability(Result countResult) {
-        long rowCount = ((LongBlock) (countResult.pages().getFirst().getBlock(0))).getLong(0);
+        long rowCount = rowCount(countResult);
         return rowCount <= SAMPLE_ROW_COUNT ? 1.0 : (double) SAMPLE_ROW_COUNT / rowCount;
     }
 
+    /**
+     * Returns the row count in the result.
+     */
+    private long rowCount(Result countResult) {
+        return ((LongBlock) (countResult.pages().getFirst().getBlock(0))).getLong(0);
+    }
+
     /**
      * Returns a plan that approximates the original plan. It consists of the
      * original plan, with the leftmost STATS function replaced by:
      * "SAMPLE probability | STATS sample_corrected_aggs".
      */
     private LogicalPlan approximatePlan(double sampleProbability) {
+        if (sampleProbability >= 1.0) {
+            logger.debug("using original plan (too few rows)");
+            return logicalPlan;
+        }
+        logger.debug("generating approximate plan (p={})", sampleProbability);
         Holder<Boolean> encounteredStats = new Holder<>(false);
         LogicalPlan approximatePlan = logicalPlan.transformUp(plan -> {
+            // TODO: check/fix for JOIN / FORK / INLINESTATS / ...
             if (plan instanceof LeafPlan) {
                 encounteredStats.set(false);
             } else if (encounteredStats.get() == false) {