1010import org .elasticsearch .xpack .esql .core .expression .Alias ;
1111import org .elasticsearch .xpack .esql .core .expression .Attribute ;
1212import org .elasticsearch .xpack .esql .core .expression .Expression ;
13+ import org .elasticsearch .xpack .esql .core .expression .Literal ;
1314import org .elasticsearch .xpack .esql .core .expression .NamedExpression ;
15+ import org .elasticsearch .xpack .esql .core .expression .ReferenceAttribute ;
16+ import org .elasticsearch .xpack .esql .core .tree .Source ;
17+ import org .elasticsearch .xpack .esql .core .type .DataType ;
1418import org .elasticsearch .xpack .esql .core .util .Holder ;
1519import org .elasticsearch .xpack .esql .expression .function .aggregate .AggregateFunction ;
1620import org .elasticsearch .xpack .esql .expression .function .grouping .GroupingFunction ;
21+ import org .elasticsearch .xpack .esql .expression .function .scalar .date .DateFormat ;
22+ import org .elasticsearch .xpack .esql .expression .function .scalar .date .DateTrunc ;
1723import org .elasticsearch .xpack .esql .plan .logical .Aggregate ;
1824import org .elasticsearch .xpack .esql .plan .logical .Eval ;
1925import org .elasticsearch .xpack .esql .plan .logical .LogicalPlan ;
26+ import org .elasticsearch .xpack .esql .plan .logical .Project ;
2027
28+ import java .time .Period ;
29+ import java .time .format .DateTimeFormatter ;
30+ import java .time .temporal .ChronoField ;
31+ import java .time .temporal .ChronoUnit ;
32+ import java .time .temporal .IsoFields ;
2133import java .util .ArrayList ;
2234import java .util .HashMap ;
2335import java .util .List ;
2436import java .util .Map ;
2537
2638/**
27- * Replace nested expressions inside a {@link Aggregate} with synthetic eval.
39+ * An optimizer rule that performs two main optimizations:
40+ * 1. Replaces nested expressions inside a {@link Aggregate} with synthetic eval
41+ * 2. Optimizes DATE_FORMAT function calls in GROUP BY clauses with more efficient DATE_TRUNC operations
42+ * <p>
43+ * For nested expressions in aggregates:
2844 * {@code STATS SUM(a + 1) BY x % 2}
2945 * becomes
3046 * {@code EVAL `a + 1` = a + 1, `x % 2` = x % 2 | STATS SUM(`a+1`_ref) BY `x % 2`_ref}
3147 * and
3248 * {@code INLINESTATS SUM(a + 1) BY x % 2}
3349 * becomes
3450 * {@code EVAL `a + 1` = a + 1, `x % 2` = x % 2 | INLINESTATS SUM(`a+1`_ref) BY `x % 2`_ref}
51+ * <p>
52+ * For date formatting optimization:
53+ * {@code STATS sum = SUM(value) BY month = DATE_FORMAT("yyyy-MM", timestamp) }
54+ * can be optimized to
55+ * {@code STATS sum = SUM(value) BY month1 = DATE_TRUNC(1 month, timestamp) | EVAL month = DATE_FORMAT("yyyy-MM", month1) | KEEP sum, month}
56+ * which is more efficient for grouping operations.
57+ * <p>
58+ * The date formatting optimization analyzes the format pattern and maps it to the smallest possible time interval
59+ * that preserves the grouping semantics. Supported intervals range from nanoseconds to years, including special
60+ * cases like quarters and weeks.
61+ * <p>
62+ * This date optimization not only improves performance but also ensures correctness in time-based grouping:
63+ * DATE_TRUNC properly handles timezone and daylight saving time (DST) transitions when using Period or Duration
64+ * intervals, while DATE_FORMAT does not account for these timezone-related considerations.
3565 */
3666public final class ReplaceAggregateNestedExpressionWithEval extends OptimizerRules .OptimizerRule <Aggregate > {
3767
@@ -41,8 +71,13 @@ protected LogicalPlan rule(Aggregate aggregate) {
4171 Map <String , Attribute > evalNames = new HashMap <>();
4272 Map <GroupingFunction , Attribute > groupingAttributes = new HashMap <>();
4373 List <Expression > newGroupings = new ArrayList <>(aggregate .groupings ());
74+ List <NamedExpression > newProjections = new ArrayList <>();
75+
4476 boolean groupingChanged = false ;
4577
78+ List <Alias > newEvals = new ArrayList <>();
79+ int [] counter = new int [] { 0 };
80+
4681 // start with the groupings since the aggs might reuse/reference them
4782 for (int i = 0 , s = newGroupings .size (); i < s ; i ++) {
4883 Expression g = newGroupings .get (i );
@@ -60,6 +95,32 @@ protected LogicalPlan rule(Aggregate aggregate) {
6095 // Move the alias into an eval and replace it with its attribute.
6196 groupingChanged = true ;
6297 var attr = as .toAttribute ();
98+ if (asChild instanceof DateFormat df ) {
99+ // Extract the format pattern and field from DateFormat
100+ Literal format = (Literal ) df .children ().getFirst ();
101+ Expression field = df .children ().get (1 );
102+
103+ // Try to convert the format pattern to a minimal time interval
104+ // This optimization attempts to simplify date formatting to DATE_TRUNC operations
105+ Literal interval = formatToMinimalInterval ((String ) format .value (), g .source ());
106+ // If we can optimize the format to use DATE_TRUNC
107+ if (interval != null ) {
108+ // Create a new DateTrunc operation with the optimized interval
109+ DateTrunc dateTrunc = new DateTrunc (df .source (), interval , field );
110+ // Create a synthetic alias for the DateTrunc operation
111+ var alias = new Alias (as .source (), syntheticName (dateTrunc , as , counter [0 ]++), dateTrunc , null , true );
112+ attr = alias .toAttribute ();
113+ // Replace the original DateFormat children with the new format and attribute
114+ Expression expression = df .replaceChildren (List .of (format , attr ));
115+ // Create a new eval alias for the optimized expression
116+ Alias newEval = as .replaceChild (expression );
117+ newEvals .add (newEval );
118+ newProjections .add (newEval .toAttribute ());
119+ evalNames .put (as .name (), attr );
120+ as = alias ;
121+ }
122+ }
123+
63124 evals .add (as );
64125 evalNames .put (as .name (), attr );
65126 newGroupings .set (i , attr );
@@ -80,7 +141,6 @@ protected LogicalPlan rule(Aggregate aggregate) {
80141 expToAttribute .put (a .child ().canonical (), a .toAttribute ());
81142 }
82143
83- int [] counter = new int [] { 0 };
84144 // for the aggs make sure to unwrap the agg function and check the existing groupings
85145 for (NamedExpression agg : aggs ) {
86146 NamedExpression a = (NamedExpression ) agg .transformDown (Alias .class , as -> {
@@ -113,8 +173,16 @@ protected LogicalPlan rule(Aggregate aggregate) {
113173
114174 return as .replaceChild (replaced );
115175 });
116-
117- newAggs .add (a );
176+ if (groupingChanged && agg instanceof ReferenceAttribute ra ) {
177+ Attribute ref = evalNames .get (ra .name ());
178+ if (ref != null ) {
179+ aggsChanged .set (true );
180+ newAggs .add (ref );
181+ }
182+ } else {
183+ newAggs .add (a );
184+ newProjections .add (a .toAttribute ());
185+ }
118186 }
119187
120188 if (evals .size () > 0 ) {
@@ -124,6 +192,10 @@ protected LogicalPlan rule(Aggregate aggregate) {
124192 var newEval = new Eval (aggregate .source (), aggregate .child (), evals );
125193 aggregate = aggregate .with (newEval , groupings , aggregates );
126194 }
195+ if (newEvals .size () > 0 ) {
196+ Eval eval = new Eval (aggregate .source (), aggregate , newEvals );
197+ return new Project (aggregate .source (), eval , newProjections );
198+ }
127199
128200 return aggregate ;
129201 }
@@ -192,4 +264,40 @@ private static Expression transformAggregateFunction(
192264 private static String syntheticName (Expression expression , Expression func , int counter ) {
193265 return TemporaryNameUtils .temporaryName (expression , func , counter );
194266 }
267+
268+ private static Literal formatToMinimalInterval (String format , Source source ) {
269+ try {
270+ DateTimeFormatter formatter = DateTimeFormatter .ofPattern (format );
271+ String formatterAsString = formatter .toString ();
272+ if (formatterAsString .contains (ChronoField .NANO_OF_SECOND .toString ())
273+ || formatterAsString .contains (ChronoField .NANO_OF_DAY .toString ())) {
274+ return new Literal (source , ChronoUnit .NANOS .getDuration (), DataType .TIME_DURATION );
275+ } else if (formatterAsString .contains (ChronoField .MILLI_OF_DAY .toString ())) {
276+ return new Literal (source , ChronoUnit .MILLIS .getDuration (), DataType .TIME_DURATION );
277+ } else if (formatterAsString .contains (ChronoField .SECOND_OF_MINUTE .toString ())) {
278+ return new Literal (source , ChronoUnit .SECONDS .getDuration (), DataType .TIME_DURATION );
279+ } else if (formatterAsString .contains (ChronoField .MINUTE_OF_HOUR .toString ())) {
280+ return new Literal (source , ChronoUnit .MINUTES .getDuration (), DataType .TIME_DURATION );
281+ } else if (formatterAsString .contains (ChronoField .HOUR_OF_DAY .toString ())
282+ || formatterAsString .contains (ChronoField .CLOCK_HOUR_OF_DAY .toString ())
283+ || formatterAsString .contains (ChronoField .HOUR_OF_AMPM .toString ())
284+ || formatterAsString .contains (ChronoField .CLOCK_HOUR_OF_AMPM .toString ())) {
285+ return new Literal (source , ChronoUnit .HOURS .getDuration (), DataType .TIME_DURATION );
286+ } else if (formatterAsString .contains (ChronoField .AMPM_OF_DAY .toString ())) {
287+ return new Literal (source , ChronoUnit .HALF_DAYS , DataType .TIME_DURATION );
288+ } else if (formatterAsString .contains (ChronoField .DAY_OF_WEEK .toString ())) {
289+ return new Literal (source , Period .ofDays (1 ), DataType .DATE_PERIOD );
290+ } else if (formatterAsString .contains (ChronoField .ALIGNED_WEEK_OF_MONTH .toString ())) {
291+ return new Literal (source , Period .ofDays (7 ), DataType .DATE_PERIOD );
292+ } else if (formatterAsString .contains (ChronoField .MONTH_OF_YEAR .toString ())) {
293+ return new Literal (source , Period .ofMonths (1 ), DataType .DATE_PERIOD );
294+ } else if (formatterAsString .contains (IsoFields .QUARTER_OF_YEAR .toString ())) {
295+ return new Literal (source , Period .ofMonths (3 ), DataType .DATE_PERIOD );
296+ } else if (formatterAsString .contains (ChronoField .YEAR_OF_ERA .toString ())
297+ || formatterAsString .contains (ChronoField .YEAR .toString ())) {
298+ return new Literal (source , Period .ofYears (1 ), DataType .DATE_PERIOD );
299+ }
300+ } catch (IllegalArgumentException ignored ) {}
301+ return null ;
302+ }
195303}
0 commit comments