elastic · jan-elastic · Jul 21, 2025 · Jul 25, 2025 · Jul 29, 2025 · Jul 30, 2025
diff --git a/.../plugin/core/src/main/java/org/elasticsearch/xpack/core/esql/action/EsqlQueryRequest.java b/.../plugin/core/src/main/java/org/elasticsearch/xpack/core/esql/action/EsqlQueryRequest.java
@@ -24,4 +24,6 @@ protected EsqlQueryRequest(StreamInput in) throws IOException {
     public abstract String query();
 
     public abstract QueryBuilder filter();
+
+    public abstract boolean approximate();
 }
diff --git a/.../core/src/main/java/org/elasticsearch/xpack/core/esql/action/EsqlQueryRequestBuilder.java b/.../core/src/main/java/org/elasticsearch/xpack/core/esql/action/EsqlQueryRequestBuilder.java
@@ -39,6 +39,8 @@ public final ActionType<Response> action() {
 
     public abstract EsqlQueryRequestBuilder<Request, Response> filter(QueryBuilder filter);
 
+    public abstract EsqlQueryRequestBuilder<Request, Response> approximate(boolean approximate);
+
     public abstract EsqlQueryRequestBuilder<Request, Response> allowPartialResults(boolean allowPartialResults);
 
 }
diff --git a/...plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvSpecReader.java b/...plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvSpecReader.java
@@ -31,6 +31,7 @@ public static class CsvSpecParser implements SpecReader.Parser {
         private final StringBuilder query = new StringBuilder();
         private final StringBuilder data = new StringBuilder();
         private final List<String> requiredCapabilities = new ArrayList<>();
+        private boolean approximate = false;
         private CsvTestCase testCase;
 
         private CsvSpecParser() {}
@@ -44,6 +45,8 @@ public Object parse(String line) {
                     earlySchema.append(line.substring(SCHEMA_PREFIX.length()).trim());
                 } else if (line.toLowerCase(Locale.ROOT).startsWith("required_capability:")) {
                     requiredCapabilities.add(line.substring("required_capability:".length()).trim());
+                } else if (line.toLowerCase(Locale.ROOT).startsWith("approximate:")) {
+                    approximate = Boolean.parseBoolean(line.substring("approximate:".length()).trim());
                 } else {
                     if (line.endsWith(";")) {
                         // pick up the query
@@ -52,9 +55,11 @@ public Object parse(String line) {
                         testCase.query = query.toString();
                         testCase.earlySchema = earlySchema.toString();
                         testCase.requiredCapabilities = List.copyOf(requiredCapabilities);
+                        testCase.approximate = approximate;
                         requiredCapabilities.clear();
                         earlySchema.setLength(0);
                         query.setLength(0);
+                        approximate = false;
                     }
                     // keep reading the query
                     else {
@@ -116,6 +121,7 @@ public static class CsvTestCase {
         private final List<Pattern> expectedWarningsRegex = new ArrayList<>();
         public boolean ignoreOrder;
         public List<String> requiredCapabilities = List.of();
+        public boolean approximate = false;
 
         /**
          * Returns the warning headers expected to be added by the test. To declare such a header, use the `warning:definition` format

diff --git a/...n/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java b/...n/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java
@@ -168,6 +168,7 @@ public class CsvTestsDataLoader {
     private static final TestDataset MV_TEXT = new TestDataset("mv_text");
     private static final TestDataset DENSE_VECTOR = new TestDataset("dense_vector");
     private static final TestDataset COLORS = new TestDataset("colors");
+    private static final TestDataset MANY_NUMBERS = new TestDataset("many_numbers");
 
     public static final Map<String, TestDataset> CSV_DATASET_MAP = Map.ofEntries(
         Map.entry(EMPLOYEES.indexName, EMPLOYEES),
@@ -234,7 +235,8 @@ public class CsvTestsDataLoader {
         Map.entry(DENSE_VECTOR.indexName, DENSE_VECTOR),
         Map.entry(COLORS.indexName, COLORS),
         Map.entry(MULTI_COLUMN_JOINABLE.indexName, MULTI_COLUMN_JOINABLE),
-        Map.entry(MULTI_COLUMN_JOINABLE_LOOKUP.indexName, MULTI_COLUMN_JOINABLE_LOOKUP)
+        Map.entry(MULTI_COLUMN_JOINABLE_LOOKUP.indexName, MULTI_COLUMN_JOINABLE_LOOKUP),
+        Map.entry(MANY_NUMBERS.indexName, MANY_NUMBERS)
     );
 
     private static final EnrichConfig LANGUAGES_ENRICH = new EnrichConfig("languages_policy", "enrich-policy-languages.json");

diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/approximate.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/approximate.csv-spec
@@ -0,0 +1,194 @@
+// Tests focused on query approximation.
+//
+// The data set "many_numbers" contains 10,000 rows with a multi-valued field
+// "value" containing 550 integers each. Aggregated over all rows, the value 1
+// occurs once, 2 occurs twice, ..., and 1000 occurs 1000 times. It's not one
+// number per row, because that's too slow to ingest.
+//
+// Note: these tests cover only basic behavior, because of limitations of the CSV
+// tests. Most tests assert that the count, average and sum of some values are
+// within a range. All ranges are very loose, so the tests should practically
+// never fail. The range checks are done in ES|QL, resulting in one boolean value
+// (is_expected), because the CSV tests don't support such assertions.
+
+approximate stats on large data
+required_capability: approximate
+approximate: true
+
+FROM many_numbers
+    | MV_EXPAND value
+    | STATS count=COUNT(), avg=AVG(value), sum=SUM(value)
+    | EVAL is_expected = count >= 450000 AND count <= 550000 AND
+                         avg >= 600 AND avg <= 750 AND
+                         sum >= 300000000 AND sum <= 380000000
+    | KEEP is_expected
+;
+
+is_expected:boolean
+true
+;
+
+
+exact stats on small data
+required_capability: approximate
+approximate: true
+
+FROM many_numbers
+    | MV_EXPAND value
+    | WHERE value <= 25
+    | STATS count=COUNT(), avg=AVG(value), sum=SUM(value)
+;
+
+count:long | avg:double | sum:long
+325        | 17.0       | 5525
+;
+
+
+with where
+required_capability: approximate
+approximate: true
+
+FROM many_numbers
+    | MV_EXPAND value
+    | WHERE value >= 500
+    | STATS count=COUNT(), avg=AVG(value), sum=SUM(value)
+    | EVAL is_expected = count >= 300000 AND count <= 450000 AND
+                         avg >= 700 AND avg <= 850 AND
+                         sum >= 270000000 AND sum <= 310000000
+    | KEEP is_expected
+;
+
+is_expected:boolean
+true
+;
+
+
+with stats where
+required_capability: approximate
+approximate: true
+
+FROM many_numbers
+    | MV_EXPAND value
+    | STATS count=COUNT() WHERE value >= 500,
+            avg=AVG(value) WHERE value >= 500,
+            sum=SUM(value) WHERE value >= 500
+    | EVAL is_expected = count >= 300000 AND count <= 450000 AND
+                         avg >= 700 AND avg <= 850 AND
+                         sum >= 270000000 AND sum <= 310000000
+    | KEEP is_expected
+;
+
+is_expected:boolean
+true
+;
+
+
+with sample
+required_capability: approximate
+approximate: true
+
+FROM many_numbers
+    | MV_EXPAND value
+    | SAMPLE 0.5
+    | STATS count=COUNT(), avg=AVG(value), sum=SUM(value)
+    | EVAL is_expected = count >= 200000 AND count <= 300000 AND
+                         avg >= 600 AND avg <= 750 AND
+                         sum >= 140000000 AND sum <= 200000000
+    | KEEP is_expected
+;
+
+is_expected:boolean
+true
+;
+
+
+with commands inbetween
+required_capability: approximate
+approximate: true
+
+FROM many_numbers
+    | MV_EXPAND value
+    | EVAL value2 = 2 * value
+    | DROP value
+    | SORT value2 DESC
+    | RENAME value2 AS value3
+    | MV_EXPAND value3
+    | EVAL value4 = TO_STRING(value3)
+    | SORT value4 ASC
+    | GROK value4 "%{NUMBER:value5}"
+    | EVAL value5 = TO_INTEGER(value5)
+    | KEEP value3, value5
+    | STATS count=COUNT(), avg=AVG(value5), sum=SUM(value5)
+    | EVAL is_expected = count >= 450000 AND count <= 550000 AND
+                         avg >= 1200 AND avg <= 1500 AND
+                         sum >= 600000000 AND sum <= 760000000
+    | KEEP is_expected
+;
+
+is_expected:boolean
+true
+;
+
+
+with commands after
+required_capability: approximate
+approximate: true
+
+FROM many_numbers
+    | MV_EXPAND value
+    | STATS count=COUNT(), avg=AVG(value), sum=SUM(value)
+    | EVAL avg2 = 2 * avg
+    | LIMIT 10
+    | MV_EXPAND avg
+    | SORT count ASC
+    | EVAL is_expected = count >= 450000 AND count <= 550000 AND
+                         avg2 >= 1200 AND avg2 <= 1500 AND
+                         sum >= 300000000 AND sum <= 380000000
+    | KEEP is_expected
+;
+
+is_expected:boolean
+true
+;
+
+
+approximate stats by on large data
+required_capability: approximate
+approximate: true
+
+FROM many_numbers
+    | MV_EXPAND value
+    | STATS count=COUNT() BY value
+    | SORT value DESC
+    | LIMIT 5
+    | EVAL is_expected = count >= 100 AND count <= 2000
+    | KEEP value, is_expected
+;
+
+value:integer | is_expected:boolean
+1000        | true
+999         | true
+998         | true
+997         | true
+996         | true
+;
+
+
+exact stats by on small data
+required_capability: approximate
+approximate: true
+
+FROM many_numbers
+    | MV_EXPAND value
+    | WHERE value <= 5
+    | STATS count=COUNT() BY value
+    | SORT value
+;
+
+count:long | value:integer
+1          | 1
+2          | 2
+3          | 3
+4          | 4
+5          | 5
+;