Skip to content

Commit c003dfd

Browse files
committed
Replace in ReplaceAggregateNestedExpressionWithEval
1 parent 98bc563 commit c003dfd

File tree

4 files changed

+164
-122
lines changed

4 files changed

+164
-122
lines changed

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizer.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@
4848
import org.elasticsearch.xpack.esql.optimizer.rules.logical.ReplaceAggregateAggExpressionWithEval;
4949
import org.elasticsearch.xpack.esql.optimizer.rules.logical.ReplaceAggregateNestedExpressionWithEval;
5050
import org.elasticsearch.xpack.esql.optimizer.rules.logical.ReplaceAliasingEvalWithProject;
51-
import org.elasticsearch.xpack.esql.optimizer.rules.logical.ReplaceGroupingByDateFormatWithDateTrunc;
5251
import org.elasticsearch.xpack.esql.optimizer.rules.logical.ReplaceLimitAndSortAsTopN;
5352
import org.elasticsearch.xpack.esql.optimizer.rules.logical.ReplaceOrderByExpressionWithEval;
5453
import org.elasticsearch.xpack.esql.optimizer.rules.logical.ReplaceRegexMatch;
@@ -138,7 +137,6 @@ protected static Batch<LogicalPlan> substitutions() {
138137
// Needs to occur before ReplaceAggregateAggExpressionWithEval, which will update the functions, losing the filter.
139138
new SubstituteFilteredExpression(),
140139
new RemoveStatsOverride(),
141-
new ReplaceGroupingByDateFormatWithDateTrunc(),
142140
// first extract nested expressions inside aggs
143141
new ReplaceAggregateNestedExpressionWithEval(),
144142
// then extract nested aggs top-level

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java

Lines changed: 112 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,28 +10,58 @@
1010
import org.elasticsearch.xpack.esql.core.expression.Alias;
1111
import org.elasticsearch.xpack.esql.core.expression.Attribute;
1212
import org.elasticsearch.xpack.esql.core.expression.Expression;
13+
import org.elasticsearch.xpack.esql.core.expression.Literal;
1314
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
15+
import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute;
16+
import org.elasticsearch.xpack.esql.core.tree.Source;
17+
import org.elasticsearch.xpack.esql.core.type.DataType;
1418
import org.elasticsearch.xpack.esql.core.util.Holder;
1519
import org.elasticsearch.xpack.esql.expression.function.aggregate.AggregateFunction;
1620
import org.elasticsearch.xpack.esql.expression.function.grouping.GroupingFunction;
21+
import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateFormat;
22+
import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateTrunc;
1723
import org.elasticsearch.xpack.esql.plan.logical.Aggregate;
1824
import org.elasticsearch.xpack.esql.plan.logical.Eval;
1925
import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan;
26+
import org.elasticsearch.xpack.esql.plan.logical.Project;
2027

28+
import java.time.Period;
29+
import java.time.format.DateTimeFormatter;
30+
import java.time.temporal.ChronoField;
31+
import java.time.temporal.ChronoUnit;
32+
import java.time.temporal.IsoFields;
2133
import java.util.ArrayList;
2234
import java.util.HashMap;
2335
import java.util.List;
2436
import java.util.Map;
2537

2638
/**
27-
* Replace nested expressions inside a {@link Aggregate} with synthetic eval.
39+
* An optimizer rule that performs two main optimizations:
40+
* 1. Replaces nested expressions inside a {@link Aggregate} with synthetic eval
41+
* 2. Optimizes DATE_FORMAT function calls in GROUP BY clauses with more efficient DATE_TRUNC operations
42+
* <p>
43+
* For nested expressions in aggregates:
2844
* {@code STATS SUM(a + 1) BY x % 2}
2945
* becomes
3046
* {@code EVAL `a + 1` = a + 1, `x % 2` = x % 2 | STATS SUM(`a+1`_ref) BY `x % 2`_ref}
3147
* and
3248
* {@code INLINESTATS SUM(a + 1) BY x % 2}
3349
* becomes
3450
* {@code EVAL `a + 1` = a + 1, `x % 2` = x % 2 | INLINESTATS SUM(`a+1`_ref) BY `x % 2`_ref}
51+
* <p>
52+
* For date formatting optimization:
53+
* {@code STATS sum = SUM(value) BY month = DATE_FORMAT("yyyy-MM", timestamp) }
54+
* can be optimized to
55+
* {@code STATS sum = SUM(value) BY month1 = DATE_TRUNC(1 month, timestamp) | EVAL month = DATE_FORMAT("yyyy-MM", month1) | KEEP sum, month}
56+
* which is more efficient for grouping operations.
57+
* <p>
58+
* The date formatting optimization analyzes the format pattern and maps it to the smallest possible time interval
59+
* that preserves the grouping semantics. Supported intervals range from nanoseconds to years, including special
60+
* cases like quarters and weeks.
61+
* <p>
62+
* This date optimization not only improves performance but also ensures correctness in time-based grouping:
63+
* DATE_TRUNC properly handles timezone and daylight saving time (DST) transitions when using Period or Duration
64+
* intervals, while DATE_FORMAT does not account for these timezone-related considerations.
3565
*/
3666
public final class ReplaceAggregateNestedExpressionWithEval extends OptimizerRules.OptimizerRule<Aggregate> {
3767

@@ -41,8 +71,13 @@ protected LogicalPlan rule(Aggregate aggregate) {
4171
Map<String, Attribute> evalNames = new HashMap<>();
4272
Map<GroupingFunction, Attribute> groupingAttributes = new HashMap<>();
4373
List<Expression> newGroupings = new ArrayList<>(aggregate.groupings());
74+
List<NamedExpression> newProjections = new ArrayList<>();
75+
4476
boolean groupingChanged = false;
4577

78+
List<Alias> newEvals = new ArrayList<>();
79+
int[] counter = new int[] { 0 };
80+
4681
// start with the groupings since the aggs might reuse/reference them
4782
for (int i = 0, s = newGroupings.size(); i < s; i++) {
4883
Expression g = newGroupings.get(i);
@@ -60,6 +95,32 @@ protected LogicalPlan rule(Aggregate aggregate) {
6095
// Move the alias into an eval and replace it with its attribute.
6196
groupingChanged = true;
6297
var attr = as.toAttribute();
98+
if (asChild instanceof DateFormat df) {
99+
// Extract the format pattern and field from DateFormat
100+
Literal format = (Literal) df.children().getFirst();
101+
Expression field = df.children().get(1);
102+
103+
// Try to convert the format pattern to a minimal time interval
104+
// This optimization attempts to simplify date formatting to DATE_TRUNC operations
105+
Literal interval = formatToMinimalInterval((String) format.value(), g.source());
106+
// If we can optimize the format to use DATE_TRUNC
107+
if (interval != null) {
108+
// Create a new DateTrunc operation with the optimized interval
109+
DateTrunc dateTrunc = new DateTrunc(df.source(), interval, field);
110+
// Create a synthetic alias for the DateTrunc operation
111+
var alias = new Alias(as.source(), syntheticName(dateTrunc, as, counter[0]++), dateTrunc, null, true);
112+
attr = alias.toAttribute();
113+
// Replace the original DateFormat children with the new format and attribute
114+
Expression expression = df.replaceChildren(List.of(format, attr));
115+
// Create a new eval alias for the optimized expression
116+
Alias newEval = as.replaceChild(expression);
117+
newEvals.add(newEval);
118+
newProjections.add(newEval.toAttribute());
119+
evalNames.put(as.name(), attr);
120+
as = alias;
121+
}
122+
}
123+
63124
evals.add(as);
64125
evalNames.put(as.name(), attr);
65126
newGroupings.set(i, attr);
@@ -80,7 +141,6 @@ protected LogicalPlan rule(Aggregate aggregate) {
80141
expToAttribute.put(a.child().canonical(), a.toAttribute());
81142
}
82143

83-
int[] counter = new int[] { 0 };
84144
// for the aggs make sure to unwrap the agg function and check the existing groupings
85145
for (NamedExpression agg : aggs) {
86146
NamedExpression a = (NamedExpression) agg.transformDown(Alias.class, as -> {
@@ -113,8 +173,16 @@ protected LogicalPlan rule(Aggregate aggregate) {
113173

114174
return as.replaceChild(replaced);
115175
});
116-
117-
newAggs.add(a);
176+
if (groupingChanged && agg instanceof ReferenceAttribute ra) {
177+
Attribute ref = evalNames.get(ra.name());
178+
if (ref != null) {
179+
aggsChanged.set(true);
180+
newAggs.add(ref);
181+
}
182+
} else {
183+
newAggs.add(a);
184+
newProjections.add(a.toAttribute());
185+
}
118186
}
119187

120188
if (evals.size() > 0) {
@@ -124,6 +192,10 @@ protected LogicalPlan rule(Aggregate aggregate) {
124192
var newEval = new Eval(aggregate.source(), aggregate.child(), evals);
125193
aggregate = aggregate.with(newEval, groupings, aggregates);
126194
}
195+
if (newEvals.size() > 0) {
196+
Eval eval = new Eval(aggregate.source(), aggregate, newEvals);
197+
return new Project(aggregate.source(), eval, newProjections);
198+
}
127199

128200
return aggregate;
129201
}
@@ -192,4 +264,40 @@ private static Expression transformAggregateFunction(
192264
private static String syntheticName(Expression expression, Expression func, int counter) {
193265
return TemporaryNameUtils.temporaryName(expression, func, counter);
194266
}
267+
268+
private static Literal formatToMinimalInterval(String format, Source source) {
269+
try {
270+
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(format);
271+
String formatterAsString = formatter.toString();
272+
if (formatterAsString.contains(ChronoField.NANO_OF_SECOND.toString())
273+
|| formatterAsString.contains(ChronoField.NANO_OF_DAY.toString())) {
274+
return new Literal(source, ChronoUnit.NANOS.getDuration(), DataType.TIME_DURATION);
275+
} else if (formatterAsString.contains(ChronoField.MILLI_OF_DAY.toString())) {
276+
return new Literal(source, ChronoUnit.MILLIS.getDuration(), DataType.TIME_DURATION);
277+
} else if (formatterAsString.contains(ChronoField.SECOND_OF_MINUTE.toString())) {
278+
return new Literal(source, ChronoUnit.SECONDS.getDuration(), DataType.TIME_DURATION);
279+
} else if (formatterAsString.contains(ChronoField.MINUTE_OF_HOUR.toString())) {
280+
return new Literal(source, ChronoUnit.MINUTES.getDuration(), DataType.TIME_DURATION);
281+
} else if (formatterAsString.contains(ChronoField.HOUR_OF_DAY.toString())
282+
|| formatterAsString.contains(ChronoField.CLOCK_HOUR_OF_DAY.toString())
283+
|| formatterAsString.contains(ChronoField.HOUR_OF_AMPM.toString())
284+
|| formatterAsString.contains(ChronoField.CLOCK_HOUR_OF_AMPM.toString())) {
285+
return new Literal(source, ChronoUnit.HOURS.getDuration(), DataType.TIME_DURATION);
286+
} else if (formatterAsString.contains(ChronoField.AMPM_OF_DAY.toString())) {
287+
return new Literal(source, ChronoUnit.HALF_DAYS, DataType.TIME_DURATION);
288+
} else if (formatterAsString.contains(ChronoField.DAY_OF_WEEK.toString())) {
289+
return new Literal(source, Period.ofDays(1), DataType.DATE_PERIOD);
290+
} else if (formatterAsString.contains(ChronoField.ALIGNED_WEEK_OF_MONTH.toString())) {
291+
return new Literal(source, Period.ofDays(7), DataType.DATE_PERIOD);
292+
} else if (formatterAsString.contains(ChronoField.MONTH_OF_YEAR.toString())) {
293+
return new Literal(source, Period.ofMonths(1), DataType.DATE_PERIOD);
294+
} else if (formatterAsString.contains(IsoFields.QUARTER_OF_YEAR.toString())) {
295+
return new Literal(source, Period.ofMonths(3), DataType.DATE_PERIOD);
296+
} else if (formatterAsString.contains(ChronoField.YEAR_OF_ERA.toString())
297+
|| formatterAsString.contains(ChronoField.YEAR.toString())) {
298+
return new Literal(source, Period.ofYears(1), DataType.DATE_PERIOD);
299+
}
300+
} catch (IllegalArgumentException ignored) {}
301+
return null;
302+
}
195303
}

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceGroupingByDateFormatWithDateTrunc.java

Lines changed: 0 additions & 116 deletions
This file was deleted.

0 commit comments

Comments
 (0)