Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
f983fee
Approximate ESQL stats execution using 1000 documents
jan-elastic Jul 21, 2025
bb37a73
refactor a bit
jan-elastic Jul 25, 2025
3f2840a
iteratively get sample probability
jan-elastic Jul 29, 2025
bfe0290
"Fix" JOIN/FORK/INLINESTATS
jan-elastic Jul 30, 2025
56ce4c5
CSV tests
jan-elastic Jul 31, 2025
c12daa8
close resources
jan-elastic Jul 31, 2025
44d9a31
better verification errors + tests
jan-elastic Aug 1, 2025
4faeb04
test row sampling behavior
jan-elastic Aug 1, 2025
8ee4bf6
add capability
jan-elastic Aug 1, 2025
2d0925a
remove debug
jan-elastic Aug 1, 2025
f23ef28
Add CSV test with STATS ... WHERE
jan-elastic Aug 27, 2025
7a73714
[CI] Auto commit changes from spotless
Aug 27, 2025
4401f54
fix csv test
jan-elastic Aug 27, 2025
a1ec788
ES|QL random
jan-elastic Sep 9, 2025
70696f4
ES|QL confidence_interval
jan-elastic Sep 9, 2025
515669e
wip approximate
jan-elastic Sep 9, 2025
84e0fb5
query with confidence interval
jan-elastic Sep 10, 2025
db47113
data types
jan-elastic Sep 11, 2025
25437db
correct stats for bucketing
jan-elastic Sep 15, 2025
6a7c619
add empty buckets
jan-elastic Sep 19, 2025
cd8d7b4
improve whitelisting plans
jan-elastic Sep 22, 2025
b87ff6f
move sample to front
jan-elastic Sep 22, 2025
e53fd3d
rename sampleId -> bucketId
jan-elastic Sep 22, 2025
874ebda
move final bucketId agg to the end
jan-elastic Sep 23, 2025
ea746fb
Fix precision issue
jan-elastic Sep 24, 2025
470eb35
seperate confidence interval column + fix to_string/date etc
jan-elastic Sep 26, 2025
e0d0594
One column per bucket
jan-elastic Sep 30, 2025
eb9bf35
Filter null buckets
jan-elastic Oct 1, 2025
fd6a227
whitelist agg functions
jan-elastic Oct 2, 2025
85d1e61
Move sample correction to approximate class
jan-elastic Oct 2, 2025
1cfa14d
disallow chained stats
jan-elastic Oct 2, 2025
b12edfc
blacklist function that may output multivalued
jan-elastic Oct 2, 2025
8f65b6f
Polish code + add documentation
jan-elastic Oct 2, 2025
13fca52
move ConfidenceInterval class
jan-elastic Oct 6, 2025
3576702
fix CsvTests
jan-elastic Oct 6, 2025
1793fd7
whitelist supported processing commands
jan-elastic Oct 6, 2025
ed6346f
fix + extend ApproximateTests
jan-elastic Oct 6, 2025
61f9bf4
[CI] Auto commit changes from spotless
Oct 7, 2025
ce0acd1
remove debug
jan-elastic Oct 7, 2025
4af56e2
fix merge error
jan-elastic Oct 7, 2025
4a6d540
move+hide+improve confidence interval computation
jan-elastic Oct 7, 2025
308c449
Add reliable computation
jan-elastic Oct 8, 2025
29544c7
more verification tests
jan-elastic Oct 8, 2025
cbee62b
trials for confidence/reliable
jan-elastic Oct 9, 2025
e9f015b
optimize many_numbers test
jan-elastic Oct 9, 2025
790ade4
spotless
jan-elastic Oct 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,6 @@ protected EsqlQueryRequest(StreamInput in) throws IOException {
public abstract String query();

public abstract QueryBuilder filter();

public abstract boolean approximate();
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ public final ActionType<Response> action() {

public abstract EsqlQueryRequestBuilder<Request, Response> filter(QueryBuilder filter);

public abstract EsqlQueryRequestBuilder<Request, Response> approximate(boolean approximate);

public abstract EsqlQueryRequestBuilder<Request, Response> allowPartialResults(boolean allowPartialResults);

}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public static class CsvSpecParser implements SpecReader.Parser {
private final StringBuilder query = new StringBuilder();
private final StringBuilder data = new StringBuilder();
private final List<String> requiredCapabilities = new ArrayList<>();
private boolean approximate = false;
private CsvTestCase testCase;

private CsvSpecParser() {}
Expand All @@ -44,6 +45,8 @@ public Object parse(String line) {
earlySchema.append(line.substring(SCHEMA_PREFIX.length()).trim());
} else if (line.toLowerCase(Locale.ROOT).startsWith("required_capability:")) {
requiredCapabilities.add(line.substring("required_capability:".length()).trim());
} else if (line.toLowerCase(Locale.ROOT).startsWith("approximate:")) {
approximate = Boolean.parseBoolean(line.substring("approximate:".length()).trim());
} else {
if (line.endsWith(";")) {
// pick up the query
Expand All @@ -52,9 +55,11 @@ public Object parse(String line) {
testCase.query = query.toString();
testCase.earlySchema = earlySchema.toString();
testCase.requiredCapabilities = List.copyOf(requiredCapabilities);
testCase.approximate = approximate;
requiredCapabilities.clear();
earlySchema.setLength(0);
query.setLength(0);
approximate = false;
}
// keep reading the query
else {
Expand Down Expand Up @@ -116,6 +121,7 @@ public static class CsvTestCase {
private final List<Pattern> expectedWarningsRegex = new ArrayList<>();
public boolean ignoreOrder;
public List<String> requiredCapabilities = List.of();
public boolean approximate = false;

/**
* Returns the warning headers expected to be added by the test. To declare such a header, use the `warning:definition` format
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ public class CsvTestsDataLoader {
private static final TestDataset MV_TEXT = new TestDataset("mv_text");
private static final TestDataset DENSE_VECTOR = new TestDataset("dense_vector");
private static final TestDataset COLORS = new TestDataset("colors");
private static final TestDataset MANY_NUMBERS = new TestDataset("many_numbers");

public static final Map<String, TestDataset> CSV_DATASET_MAP = Map.ofEntries(
Map.entry(EMPLOYEES.indexName, EMPLOYEES),
Expand Down Expand Up @@ -234,7 +235,8 @@ public class CsvTestsDataLoader {
Map.entry(DENSE_VECTOR.indexName, DENSE_VECTOR),
Map.entry(COLORS.indexName, COLORS),
Map.entry(MULTI_COLUMN_JOINABLE.indexName, MULTI_COLUMN_JOINABLE),
Map.entry(MULTI_COLUMN_JOINABLE_LOOKUP.indexName, MULTI_COLUMN_JOINABLE_LOOKUP)
Map.entry(MULTI_COLUMN_JOINABLE_LOOKUP.indexName, MULTI_COLUMN_JOINABLE_LOOKUP),
Map.entry(MANY_NUMBERS.indexName, MANY_NUMBERS)
);

private static final EnrichConfig LANGUAGES_ENRICH = new EnrichConfig("languages_policy", "enrich-policy-languages.json");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
// Tests focused on query approximation.
//
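// The data set "many_numbers" contains 10,000 rows with a multi-valued field
// "value" containing 550 integers each. Aggregated over all rows, the value k
// occurs exactly k times: 1x1, 2x2, ..., 999x999 and 1000x1000 (500,500 values
// in total). It's not one number per row, because that's too slow to ingest.
// NOTE(review): 10,000 rows x 550 integers does not add up to 500,500 values;
// confirm the row count stated above.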
//
// Note: this tests only basic behavior, because of limitations of the CSV tests.
// Most tests assert that the count, average and sum of some values are within a
// range. All ranges are very loose, so that the tests should practically never fail.
// The range checks are done in ES|QL, resulting in one boolean value (is_expected),
// because the CSV tests don't support such assertions.

approximate stats on large data
required_capability: approximate
approximate: true

FROM many_numbers
| MV_EXPAND value
| STATS count=COUNT(), avg=AVG(value), sum=SUM(value)
| EVAL is_expected = count >= 450000 AND count <= 550000 AND
avg >= 600 AND avg <= 750 AND
sum >= 300000000 AND sum <= 380000000
| KEEP is_expected
;

is_expected:boolean
true
;


exact stats on small data
required_capability: approximate
approximate: true

FROM many_numbers
| MV_EXPAND value
| WHERE value <= 25
| STATS count=COUNT(), avg=AVG(value), sum=SUM(value)
;

count:long | avg:double | sum:long
325 | 17.0 | 5525
;


with where
required_capability: approximate
approximate: true

FROM many_numbers
| MV_EXPAND value
| WHERE value >= 500
| STATS count=COUNT(), avg=AVG(value), sum=SUM(value)
| EVAL is_expected = count >= 300000 AND count <= 450000 AND
avg >= 700 AND avg <= 850 AND
sum >= 270000000 AND sum <= 310000000
| KEEP is_expected
;

is_expected:boolean
true
;


with stats where
required_capability: approximate
approximate: true

FROM many_numbers
| MV_EXPAND value
| STATS count=COUNT() WHERE value >= 500,
avg=AVG(value) WHERE value >= 500,
sum=SUM(value) WHERE value >= 500
| EVAL is_expected = count >= 300000 AND count <= 450000 AND
avg >= 700 AND avg <= 850 AND
sum >= 270000000 AND sum <= 310000000
| KEEP is_expected
;

is_expected:boolean
true
;


with sample
required_capability: approximate
approximate: true

FROM many_numbers
| MV_EXPAND value
| SAMPLE 0.5
| STATS count=COUNT(), avg=AVG(value), sum=SUM(value)
| EVAL is_expected = count >= 200000 AND count <= 300000 AND
avg >= 600 AND avg <= 750 AND
sum >= 140000000 AND sum <= 200000000
| KEEP is_expected
;

is_expected:boolean
true
;


with commands inbetween
required_capability: approximate
approximate: true

FROM many_numbers
| MV_EXPAND value
| EVAL value2 = 2 * value
| DROP value
| SORT value2 DESC
| RENAME value2 AS value3
| MV_EXPAND value3
| EVAL value4 = TO_STRING(value3)
| SORT value4 ASC
| GROK value4 "%{NUMBER:value5}"
| EVAL value5 = TO_INTEGER(value5)
| KEEP value3, value5
| STATS count=COUNT(), avg=AVG(value5), sum=SUM(value5)
| EVAL is_expected = count >= 450000 AND count <= 550000 AND
avg >= 1200 AND avg <= 1500 AND
sum >= 600000000 AND sum <= 760000000
| KEEP is_expected
;

is_expected:boolean
true
;


with commands after
required_capability: approximate
approximate: true

FROM many_numbers
| MV_EXPAND value
| STATS count=COUNT(), avg=AVG(value), sum=SUM(value)
| EVAL avg2 = 2 * avg
| LIMIT 10
| MV_EXPAND avg
| SORT count ASC
| EVAL is_expected = count >= 450000 AND count <= 550000 AND
avg2 >= 1200 AND avg2 <= 1500 AND
sum >= 300000000 AND sum <= 380000000
| KEEP is_expected
;

is_expected:boolean
true
;


approximate stats by on large data
required_capability: approximate
approximate: true

FROM many_numbers
| MV_EXPAND value
| STATS count=COUNT() BY value
| SORT value DESC
| LIMIT 5
| EVAL is_expected = count >= 100 AND count <= 2000
| KEEP value, is_expected
;

value:integer | is_expected:boolean
1000 | true
999 | true
998 | true
997 | true
996 | true
;


exact stats by on small data
required_capability: approximate
approximate: true

FROM many_numbers
| MV_EXPAND value
| WHERE value <= 5
| STATS count=COUNT() BY value
| SORT value
;

count:long | value:integer
1 | 1
2 | 2
3 | 3
4 | 4
5 | 5
;
Loading