|
43 | 43 | import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToLong; |
44 | 44 | import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvAppend; |
45 | 45 | import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvSlice; |
46 | | -import org.elasticsearch.xpack.esql.expression.predicate.logical.And; |
47 | 46 | import org.elasticsearch.xpack.esql.expression.predicate.nulls.IsNotNull; |
48 | 47 | import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Div; |
49 | 48 | import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.Equals; |
| 49 | +import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.GreaterThanOrEqual; |
50 | 50 | import org.elasticsearch.xpack.esql.optimizer.LogicalPlanOptimizer; |
51 | 51 | import org.elasticsearch.xpack.esql.plan.logical.Aggregate; |
52 | 52 | import org.elasticsearch.xpack.esql.plan.logical.ChangePoint; |
@@ -569,6 +569,9 @@ private LogicalPlan approximatePlan(double sampleProbability) { |
569 | 569 | Alias bucketIdField = new Alias(Source.EMPTY, "$bucket_id", bucketIds); |
570 | 570 |
|
571 | 571 | List<NamedExpression> aggregates = new ArrayList<>(); |
| 572 | + Alias sampleSize = new Alias(Source.EMPTY, "$sample_size", COUNT_ALL_ROWS); |
| 573 | + aggregates.add(sampleSize); |
| 574 | + |
572 | 575 | for (NamedExpression aggOrKey : aggregate.aggregates()) { |
573 | 576 | if ((aggOrKey instanceof Alias alias && alias.child() instanceof AggregateFunction) == false) { |
574 | 577 | // This is a grouping key, not an aggregate function. |
@@ -633,10 +636,18 @@ private LogicalPlan approximatePlan(double sampleProbability) { |
633 | 636 | } |
634 | 637 |
|
635 | 638 | // Add the bucket ID, do the aggregations (sampled corrected, including the buckets), |
636 | | - // and filter out rows with empty buckets. |
| 639 | + // and filter out rows with fewer than 10 sampled values. |
637 | 640 | plan = new Eval(Source.EMPTY, aggregate.child(), List.of(bucketIdField)); |
638 | 641 | plan = aggregate.with(plan, aggregate.groupings(), aggregates); |
| 642 | + plan = new Filter( |
| 643 | + Source.EMPTY, |
| 644 | + plan, |
| 645 | + new GreaterThanOrEqual(Source.EMPTY, sampleSize.toAttribute(), Literal.fromLong(Source.EMPTY, 10L)) |
| 646 | + ); |
639 | 647 |
|
| 648 | + List<Attribute> keepAttributes = new ArrayList<>(plan.output()); |
| 649 | + keepAttributes.remove(sampleSize.toAttribute()); |
| 650 | + plan = new Project(Source.EMPTY, plan, keepAttributes); |
640 | 651 | } else if (encounteredStats.get()) { |
641 | 652 | // After the STATS function, any processing of fields that have buckets, should |
642 | 653 | // also process the buckets, so that confidence intervals for the dependent fields |
@@ -698,7 +709,6 @@ private LogicalPlan approximatePlan(double sampleProbability) { |
698 | 709 |
|
699 | 710 | // Compute the confidence interval for all output fields that have buckets. |
700 | 711 | List<Alias> confidenceIntervalsAndReliable = new ArrayList<>(); |
701 | | - Expression confidenceIntervalsExist = Literal.TRUE; |
702 | 712 | for (Attribute output : logicalPlan.output()) { |
703 | 713 | if (fieldBuckets.containsKey(output.id())) { |
704 | 714 | List<Alias> buckets = fieldBuckets.get(output.id()); |
@@ -743,11 +753,9 @@ private LogicalPlan approximatePlan(double sampleProbability) { |
743 | 753 | new Reliable(Source.EMPTY, bucketsMv, trialCount, bucketCount) |
744 | 754 | ) |
745 | 755 | ); |
746 | | - confidenceIntervalsExist = new And(Source.EMPTY, confidenceIntervalsExist, new IsNotNull(Source.EMPTY, confidenceInterval)); |
747 | 756 | } |
748 | 757 | } |
749 | 758 | approximatePlan = new Eval(Source.EMPTY, approximatePlan, confidenceIntervalsAndReliable); |
750 | | - approximatePlan = new Filter(Source.EMPTY, approximatePlan, confidenceIntervalsExist); |
751 | 759 |
|
752 | 760 | // Finally, drop all bucket fields from the output. |
753 | 761 | Set<Attribute> dropAttributes = fieldBuckets.values() |
|
0 commit comments