Skip to content

Commit 2287b10

Browse files
committed
filter rows with < 10 sampled values
1 parent 5b73ffa commit 2287b10

File tree

2 files changed

+17
-5
lines changed

2 files changed

+17
-5
lines changed

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/approximate/Approximate.java

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@
4343
import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToLong;
4444
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvAppend;
4545
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvSlice;
46-
import org.elasticsearch.xpack.esql.expression.predicate.logical.And;
4746
import org.elasticsearch.xpack.esql.expression.predicate.nulls.IsNotNull;
4847
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Div;
4948
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.Equals;
49+
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.GreaterThanOrEqual;
5050
import org.elasticsearch.xpack.esql.optimizer.LogicalPlanOptimizer;
5151
import org.elasticsearch.xpack.esql.plan.logical.Aggregate;
5252
import org.elasticsearch.xpack.esql.plan.logical.ChangePoint;
@@ -569,6 +569,9 @@ private LogicalPlan approximatePlan(double sampleProbability) {
569569
Alias bucketIdField = new Alias(Source.EMPTY, "$bucket_id", bucketIds);
570570

571571
List<NamedExpression> aggregates = new ArrayList<>();
572+
Alias sampleSize = new Alias(Source.EMPTY, "$sample_size", COUNT_ALL_ROWS);
573+
aggregates.add(sampleSize);
574+
572575
for (NamedExpression aggOrKey : aggregate.aggregates()) {
573576
if ((aggOrKey instanceof Alias alias && alias.child() instanceof AggregateFunction) == false) {
574577
// This is a grouping key, not an aggregate function.
@@ -633,10 +636,18 @@ private LogicalPlan approximatePlan(double sampleProbability) {
633636
}
634637

635638
// Add the bucket ID, do the aggregations (sampled corrected, including the buckets),
636-
// and filter out rows with empty buckets.
639+
// and filter out rows with fewer than 10 sampled values.
637640
plan = new Eval(Source.EMPTY, aggregate.child(), List.of(bucketIdField));
638641
plan = aggregate.with(plan, aggregate.groupings(), aggregates);
642+
plan = new Filter(
643+
Source.EMPTY,
644+
plan,
645+
new GreaterThanOrEqual(Source.EMPTY, sampleSize.toAttribute(), Literal.fromLong(Source.EMPTY, 10L))
646+
);
639647

648+
List<Attribute> keepAttributes = new ArrayList<>(plan.output());
649+
keepAttributes.remove(sampleSize.toAttribute());
650+
plan = new Project(Source.EMPTY, plan, keepAttributes);
640651
} else if (encounteredStats.get()) {
641652
// After the STATS function, any processing of fields that have buckets should
642653
// also process the buckets, so that confidence intervals for the dependent fields
@@ -698,7 +709,6 @@ private LogicalPlan approximatePlan(double sampleProbability) {
698709

699710
// Compute the confidence interval for all output fields that have buckets.
700711
List<Alias> confidenceIntervalsAndReliable = new ArrayList<>();
701-
Expression confidenceIntervalsExist = Literal.TRUE;
702712
for (Attribute output : logicalPlan.output()) {
703713
if (fieldBuckets.containsKey(output.id())) {
704714
List<Alias> buckets = fieldBuckets.get(output.id());
@@ -743,11 +753,9 @@ private LogicalPlan approximatePlan(double sampleProbability) {
743753
new Reliable(Source.EMPTY, bucketsMv, trialCount, bucketCount)
744754
)
745755
);
746-
confidenceIntervalsExist = new And(Source.EMPTY, confidenceIntervalsExist, new IsNotNull(Source.EMPTY, confidenceInterval));
747756
}
748757
}
749758
approximatePlan = new Eval(Source.EMPTY, approximatePlan, confidenceIntervalsAndReliable);
750-
approximatePlan = new Filter(Source.EMPTY, approximatePlan, confidenceIntervalsExist);
751759

752760
// Finally, drop all bucket fields from the output.
753761
Set<Attribute> dropAttributes = fieldBuckets.values()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
package org.elasticsearch.xpack.esql.expression.function.scalar.approximate;
2+
3+
public class ConfidenceIntervalTests {
4+
}

0 commit comments

Comments
 (0)