Skip to content

Commit 4b49cf3

Browse files
committed
don't sample with prob > 10%
1 parent b8e786d commit 4b49cf3

File tree

2 files changed

+43
-6
lines changed

2 files changed

+43
-6
lines changed

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/approximate/Approximate.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,12 @@ public record QueryProperties(boolean canDecreaseRowCount, boolean canIncreaseRo
243243
// TODO: set via a query setting
244244
private static final double CONFIDENCE_LEVEL = 0.90;
245245

246+
/**
247+
* Don't sample with a probability higher than this threshold. The cost of
248+
* tracking confidence intervals doesn't outweigh the benefits of sampling.
249+
*/
250+
private static final double SAMPLE_PROBABILITY_THRESHOLD = 0.1;
251+
246252
/**
247253
* The number of times (trials) the sampled rows are divided into buckets.
248254
*/
@@ -442,8 +448,8 @@ private ActionListener<Result> sourceCountListener(ActionListener<Result> listen
442448
sourceRowCount = rowCount(countResult);
443449
logger.debug("sourceCountPlan result: {} rows", sourceRowCount);
444450
double sampleProbability = sourceRowCount <= SAMPLE_ROW_COUNT ? 1.0 : (double) SAMPLE_ROW_COUNT / sourceRowCount;
445-
if (queryProperties.canIncreaseRowCount == false && sampleProbability == 1.0) {
446-
// If the query cannot increase the number of rows, and the sample probability is 1.0,
451+
if (queryProperties.canIncreaseRowCount == false && sampleProbability > SAMPLE_PROBABILITY_THRESHOLD) {
452+
// If the query cannot increase the number of rows, and the sample probability is large,
447453
// we can directly run the original query without sampling.
448454
runner.run(toPhysicalPlan.apply(logicalPlan), configuration, foldContext, listener);
449455
} else if (queryProperties.canIncreaseRowCount == false && queryProperties.canDecreaseRowCount == false) {
@@ -510,7 +516,7 @@ private ActionListener<Result> countListener(double sampleProbability, ActionLis
510516
long rowCount = rowCount(countResult);
511517
logger.debug("countPlan result (p={}): {} rows", sampleProbability, rowCount);
512518
double newSampleProbability = Math.min(1.0, sampleProbability * SAMPLE_ROW_COUNT / Math.max(1, rowCount));
513-
if (rowCount <= SAMPLE_ROW_COUNT / 2 && newSampleProbability < 1.0) {
519+
if (rowCount <= SAMPLE_ROW_COUNT / 2 && newSampleProbability < SAMPLE_PROBABILITY_THRESHOLD) {
514520
runner.run(
515521
toPhysicalPlan.apply(countPlan(newSampleProbability)),
516522
configuration,
@@ -583,7 +589,7 @@ private long rowCount(Result countResult) {
583589
* </pre>
584590
*/
585591
private LogicalPlan approximatePlan(double sampleProbability) {
586-
if (sampleProbability == 1.0) {
592+
if (sampleProbability > SAMPLE_PROBABILITY_THRESHOLD) {
587593
logger.debug("using original plan (too few rows)");
588594
return logicalPlan;
589595
}

x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/approximate/ApproximateTests.java

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ public void testCountPlan_smallDataAfterMvExpanding() throws Exception {
311311
// - one pass to get the number of expanded rows (which determines the sample probability)
312312
// - one pass to execute the original query
313313
assertThat(runner.invocations, hasSize(4));
314-
assertThat(runner.invocations.get(0), allOf(hasMvExpand("emp_no")));
314+
assertThat(runner.invocations.get(0), allOf(hasMvExpand("emp_no"), not(hasSample())));
315315
assertThat(runner.invocations.get(1), allOf(not(hasMvExpand("emp_no")), not(hasSample())));
316316
assertThat(runner.invocations.get(2), allOf(hasMvExpand("emp_no"), not(hasSample())));
317317
assertThat(runner.invocations.get(3), allOf(hasMvExpand("emp_no"), not(hasSample())));
@@ -327,7 +327,7 @@ public void testCountPlan_largeDataAfterMvExpanding() throws Exception {
327327
// - one pass to get the number of expanded rows (which determines the sample probability)
328328
// - one pass to approximate the query
329329
assertThat(runner.invocations, hasSize(4));
330-
assertThat(runner.invocations.get(0), allOf(hasMvExpand("emp_no")));
330+
assertThat(runner.invocations.get(0), allOf(hasMvExpand("emp_no"), not(hasSample())));
331331
assertThat(runner.invocations.get(1), allOf(not(hasMvExpand("emp_no")), not(hasSample())));
332332
assertThat(runner.invocations.get(2), allOf(hasMvExpand("emp_no"), not(hasSample())));
333333
assertThat(runner.invocations.get(3), allOf(hasMvExpand("emp_no"), hasSample(1e-4)));
@@ -349,6 +349,37 @@ public void testCountPlan_largeDataBeforeMvExpanding() throws Exception {
349349
assertThat(runner.invocations.get(3), allOf(hasMvExpand("emp_no"), hasSample(1e-7)));
350350
}
351351

352+
public void testCountPlan_sampleProbabilityThreshold_noFilter() throws Exception {
353+
TestRunner runner = new TestRunner(500_000, 500_000);
354+
Approximate approximate = createApproximate("FROM test | STATS SUM(emp_no)", runner);
355+
approximate.approximate(TestRunner.resultCloser);
356+
// This plan needs three passes:
357+
// - one pass to check whether it's an ES stats query (always no for the test runner)
358+
// - one pass to get the total number of rows
359+
// - one pass to execute the original query (because the sample probability is 20%)
360+
assertThat(runner.invocations, hasSize(3));
361+
assertThat(runner.invocations.get(0), allOf(not(hasSample())));
362+
assertThat(runner.invocations.get(1), allOf(not(hasSample())));
363+
assertThat(runner.invocations.get(1), allOf(not(hasSample())));
364+
}
365+
366+
public void testCountPlan_sampleProbabilityThreshold_withFilter() throws Exception {
367+
TestRunner runner = new TestRunner(1_000_000_000_000L, 200_000);
368+
Approximate approximate = createApproximate("FROM test | WHERE emp_no > 1 | STATS SUM(emp_no)", runner);
369+
approximate.approximate(TestRunner.resultCloser);
370+
// This plan needs three passes:
371+
// - one pass to check whether it's an ES stats query (always no for the test runner)
372+
// - one pass to get the total number of rows
373+
// - two passes to get the number of filtered rows (which determines the sample probability)
374+
// - one pass to execute the original query (because the sample probability is 50%)
375+
assertThat(runner.invocations, hasSize(5));
376+
assertThat(runner.invocations.get(0), allOf(not(hasSample()), hasFilter("emp_no")));
377+
assertThat(runner.invocations.get(1), allOf(not(hasSample()), not(hasFilter("emp_no"))));
378+
assertThat(runner.invocations.get(2), allOf(hasSample(1e-7), hasFilter("emp_no")));
379+
assertThat(runner.invocations.get(3), allOf(hasSample(1e-2), hasFilter("emp_no")));
380+
assertThat(runner.invocations.get(4), allOf(not(hasSample()), hasFilter("emp_no")));
381+
}
382+
352383
public void testApproximatePlan_createsConfidenceInterval_withoutGrouping() throws Exception {
353384
TestRunner runner = new TestRunner(1_000_000_000, 1_000_000_000);
354385
Approximate approximate = createApproximate("FROM test | STATS COUNT(), SUM(emp_no)", runner);

0 commit comments

Comments
 (0)