diff --git a/docs/changelog/129277.yaml b/docs/changelog/129277.yaml new file mode 100644 index 0000000000000..4dfdacf6d5f2f --- /dev/null +++ b/docs/changelog/129277.yaml @@ -0,0 +1,6 @@ +pr: 129277 +summary: "ESQL: Replace grouping by DateFormat with DateTrunc" +area: ES|QL +type: enhancement +issues: + - 114772 diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/inlinestats.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/inlinestats.csv-spec index 06c1d2d92b253..c682a636d98e1 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/inlinestats.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/inlinestats.csv-spec @@ -4392,3 +4392,313 @@ row a = 1 c:long 1 ; + + +inlineStatsDateFormatYear +required_capability: inline_stats + +FROM employees +| KEEP emp_no, hire_date +| INLINE STATS count = COUNT(*) BY year_hired = DATE_FORMAT("yyyy", hire_date) +| SORT emp_no +| LIMIT 5 +; + +emp_no:integer | hire_date:datetime | count:long | year_hired:keyword +10001 | 1986-06-26T00:00:00.000Z | 11 | 1986 +10002 | 1985-11-21T00:00:00.000Z | 11 | 1985 +10003 | 1986-08-28T00:00:00.000Z | 11 | 1986 +10004 | 1986-12-01T00:00:00.000Z | 11 | 1986 +10005 | 1989-09-12T00:00:00.000Z | 13 | 1989 +; + +inlineStatsDateFormatMonth +required_capability: inline_stats + +FROM employees +| KEEP emp_no, hire_date, salary +| INLINE STATS avg_salary = AVG(salary) BY month_hired = DATE_FORMAT("yyyy-MM", hire_date) +| EVAL avg_salary = ROUND(avg_salary) +| SORT emp_no +| LIMIT 5 +; + +emp_no:integer | hire_date:datetime | salary:integer | month_hired:keyword | avg_salary:double +10001 | 1986-06-26T00:00:00.000Z | 57305 | 1986-06 | 57305.0 +10002 | 1985-11-21T00:00:00.000Z | 56371 | 1985-11 | 54540.0 +10003 | 1986-08-28T00:00:00.000Z | 61805 | 1986-08 | 52704.0 +10004 | 1986-12-01T00:00:00.000Z | 36174 | 1986-12 | 36174.0 +10005 | 1989-09-12T00:00:00.000Z | 63528 | 1989-09 | 49924.0 +; + +inlineStatsDateFormatDay +required_capability: inline_stats + +FROM employees +| KEEP emp_no, hire_date, salary +| INLINE STATS max_salary = MAX(salary) BY day_hired = DATE_FORMAT("yyyy-MM-dd", hire_date) +| SORT emp_no +| LIMIT 3 +; + +emp_no:integer | hire_date:datetime | salary:integer | max_salary:integer | day_hired:keyword +10001 | 1986-06-26T00:00:00.000Z | 57305 | 57305 | 1986-06-26 +10002 | 1985-11-21T00:00:00.000Z | 56371 | 56371 | 1985-11-21 +10003 | 1986-08-28T00:00:00.000Z | 61805 | 61805 | 1986-08-28 +; + +inlineStatsDateFormatMixed +required_capability: inline_stats + +FROM employees +| KEEP emp_no, hire_date, salary +| INLINE STATS + count = COUNT(*), + avg_salary = AVG(salary) + BY + year_hired = DATE_FORMAT("yyyy", hire_date), + week_hired = DATE_FORMAT("yyyy-w", hire_date) +| EVAL avg_salary = ROUND(avg_salary) +| SORT emp_no +| LIMIT 3 +; + +emp_no:integer | hire_date:datetime | salary:integer | count:long | year_hired:keyword | week_hired:keyword | avg_salary:double +10001 | 1986-06-26T00:00:00.000Z | 57305 | 1 | 1986 | 1986-26 | 57305.0 +10002 | 1985-11-21T00:00:00.000Z | 56371 | 4 | 1985 | 1985-47 | 54540.0 +10003 | 1986-08-28T00:00:00.000Z | 61805 | 1 | 1986 | 1986-35 | 61805.0 +; + +inlineStatsDateFormatWithLiterals +required_capability: inline_stats + +FROM employees +| KEEP emp_no, hire_date +| INLINE STATS count = COUNT(*) BY formatted_date = DATE_FORMAT("'Year:'yyyy'-Month:'MM", hire_date) +| SORT emp_no +| LIMIT 3 +; + +emp_no:integer | hire_date:datetime | count:long | formatted_date:keyword +10001 | 1986-06-26T00:00:00.000Z | 1 | Year:1986-Month:06 
+10002 | 1985-11-21T00:00:00.000Z | 4 | Year:1985-Month:11 +10003 | 1986-08-28T00:00:00.000Z | 2 | Year:1986-Month:08 +; + +inlineStatsDateFormatNonOptimizable +required_capability: inline_stats + +FROM employees +| KEEP emp_no, hire_date +| INLINE STATS count = COUNT(*) BY week = DATE_FORMAT("yyyy-w", hire_date) +| SORT emp_no +| LIMIT 3 +; + +emp_no:integer | hire_date:datetime | count:long | week:keyword +10001 | 1986-06-26T00:00:00.000Z | 1 | 1986-26 +10002 | 1985-11-21T00:00:00.000Z | 4 | 1985-47 +10003 | 1986-08-28T00:00:00.000Z | 1 | 1986-35 +; + +inlineStatsDateFormatMultipleOptimizations +required_capability: inline_stats + +FROM employees +| KEEP emp_no, hire_date, salary +| INLINE STATS + count = COUNT(*), + min_salary = MIN(salary), + max_salary = MAX(salary) + BY + year_hired = DATE_FORMAT("yyyy", hire_date), + month_hired = DATE_FORMAT("yyyy-MM", hire_date), + day_hired = DATE_FORMAT("yyyy-MM-dd", hire_date) +| SORT day_hired +| LIMIT 3 +; + +emp_no:integer | hire_date:datetime | salary:integer | count:long | min_salary:integer | max_salary:integer | year_hired:keyword | month_hired:keyword | day_hired:keyword +10009 | 1985-02-18T00:00:00.000Z | 66174 | 1 | 66174 | 66174 | 1985 | 1985-02 | 1985-02-18 +10048 | 1985-02-24T00:00:00.000Z | 26436 | 1 | 26436 | 26436 | 1985 | 1985-02 | 1985-02-24 +10098 | 1985-05-13T00:00:00.000Z | 44817 | 1 | 44817 | 44817 | 1985 | 1985-05 | 1985-05-13 + +; + +inlineStatsDateFormatWithDateNanos +required_capability: inline_stats + +FROM sample_data_ts_nanos +| KEEP @timestamp, event_duration, client_ip +| INLINE STATS + count = COUNT(*), + avg_duration = AVG(event_duration) +BY year_month = DATE_FORMAT("yyyy-MM", @timestamp) +| EVAL avg_duration = ROUND(avg_duration) +| SORT @timestamp +| LIMIT 5 +; + +@timestamp:date_nanos | event_duration:long | client_ip:ip | count:long | year_month:keyword | avg_duration:double +2023-10-23T12:15:03.360123456Z | 3450233 | 172.21.2.162 | 7 | 2023-10 | 3318761.0 +2023-10-23T12:27:28.948123456Z | 2764889 | 172.21.2.113 | 7 | 2023-10 | 3318761.0 +2023-10-23T13:33:34.937123456Z | 1232382 | 172.21.0.5 | 7 | 2023-10 | 3318761.0 +2023-10-23T13:51:54.732123456Z | 725448 | 172.21.3.15 | 7 | 2023-10 | 3318761.0 +2023-10-23T13:52:55.015123456Z | 8268153 | 172.21.3.15 | 7 | 2023-10 | 3318761.0 +; + +inlineStatsDateFormatDateNanosYear +required_capability: inline_stats + +FROM sample_data_ts_nanos +| KEEP @timestamp, event_duration +| INLINE STATS + count = COUNT(*), + min_timestamp = MIN(@timestamp), + max_timestamp = MAX(@timestamp), + total_duration = SUM(event_duration) +BY year = DATE_FORMAT("yyyy", @timestamp) +| SORT @timestamp +| LIMIT 3 +; + +@timestamp:date_nanos | event_duration:long | count:long | min_timestamp:date_nanos | max_timestamp:date_nanos | total_duration:long | year:keyword +2023-10-23T12:15:03.360123456Z | 3450233 | 7 | 2023-10-23T12:15:03.360123456Z | 2023-10-23T13:55:01.543123456Z | 23231327 | 2023 +2023-10-23T12:27:28.948123456Z | 2764889 | 7 | 2023-10-23T12:15:03.360123456Z | 2023-10-23T13:55:01.543123456Z | 23231327 | 2023 +2023-10-23T13:33:34.937123456Z | 1232382 | 7 | 2023-10-23T12:15:03.360123456Z | 2023-10-23T13:55:01.543123456Z | 23231327 | 2023 +; + +inlineStatsDateFormatDateNanosHour +required_capability: inline_stats + +FROM sample_data_ts_nanos +| KEEP @timestamp, event_duration, client_ip +| INLINE STATS + count = COUNT(*), + clients = COUNT(client_ip), + max_duration = MAX(event_duration) +BY hour = DATE_FORMAT("HH", @timestamp) +| SORT @timestamp +| LIMIT 5 +; + +@timestamp:date_nanos 
| event_duration:long | client_ip:ip | count:long | clients:long | max_duration:long | hour:keyword +2023-10-23T12:15:03.360123456Z | 3450233 | 172.21.2.162 | 2 | 2 | 3450233 | 12 +2023-10-23T12:27:28.948123456Z | 2764889 | 172.21.2.113 | 2 | 2 | 3450233 | 12 +2023-10-23T13:33:34.937123456Z | 1232382 | 172.21.0.5 | 5 | 5 | 8268153 | 13 +2023-10-23T13:51:54.732123456Z | 725448 | 172.21.3.15 | 5 | 5 | 8268153 | 13 +2023-10-23T13:52:55.015123456Z | 8268153 | 172.21.3.15 | 5 | 5 | 8268153 | 13 +; + +inlineStatsDateFormatDateNanosMultipleFormats +required_capability: inline_stats + +FROM sample_data_ts_nanos +| KEEP @timestamp, event_duration +| INLINE STATS + count = COUNT(*), + avg_duration = AVG(event_duration) +BY + year = DATE_FORMAT("yyyy", @timestamp), + month = DATE_FORMAT("MM", @timestamp), + day = DATE_FORMAT("dd", @timestamp), + hour = DATE_FORMAT("HH", @timestamp) +| EVAL avg_duration = ROUND(avg_duration) +| SORT @timestamp +| LIMIT 5 +; + +@timestamp:date_nanos | event_duration:long | count:long | year:keyword | month:keyword | day:keyword | hour:keyword | avg_duration:double +2023-10-23T12:15:03.360123456Z | 3450233 | 2 | 2023 | 10 | 23 | 12 | 3107561.0 +2023-10-23T12:27:28.948123456Z | 2764889 | 2 | 2023 | 10 | 23 | 12 | 3107561.0 +2023-10-23T13:33:34.937123456Z | 1232382 | 5 | 2023 | 10 | 23 | 13 | 3403241.0 +2023-10-23T13:51:54.732123456Z | 725448 | 5 | 2023 | 10 | 23 | 13 | 3403241.0 +2023-10-23T13:52:55.015123456Z | 8268153 | 5 | 2023 | 10 | 23 | 13 | 3403241.0 +; + +inlineStatsDateFormatDateNanosWithFilter +required_capability: inline_stats + +FROM sample_data_ts_nanos +| WHERE event_duration > 2000000 +| KEEP @timestamp, event_duration, client_ip +| INLINE STATS + count = COUNT(*), + min_duration = MIN(event_duration), + max_duration = MAX(event_duration) +BY year = DATE_FORMAT("yyyy", @timestamp) +| SORT @timestamp +| LIMIT 3 +; + +@timestamp:date_nanos | event_duration:long | client_ip:ip | count:long | min_duration:long | max_duration:long | year:keyword +2023-10-23T12:15:03.360123456Z | 3450233 | 172.21.2.162 | 4 | 2764889 | 8268153 | 2023 +2023-10-23T12:27:28.948123456Z | 2764889 | 172.21.2.113 | 4 | 2764889 | 8268153 | 2023 +2023-10-23T13:52:55.015123456Z | 8268153 | 172.21.3.15 | 4 | 2764889 | 8268153 | 2023 +; + +inlineStatsDateFormatDateNanosComplexPattern +required_capability: inline_stats + +FROM sample_data_ts_nanos +| KEEP @timestamp, event_duration, client_ip +| INLINE STATS + count = COUNT(*), + clients = COUNT(client_ip) +BY formatted_time = DATE_FORMAT("yyyy-MM-dd HH:mm", @timestamp) +| SORT @timestamp +| LIMIT 5 +; + +@timestamp:date_nanos | event_duration:long | client_ip:ip | count:long | clients:long | formatted_time:keyword +2023-10-23T12:15:03.360123456Z | 3450233 | 172.21.2.162 | 1 | 1 | 2023-10-23 12:15 +2023-10-23T12:27:28.948123456Z | 2764889 | 172.21.2.113 | 1 | 1 | 2023-10-23 12:27 +2023-10-23T13:33:34.937123456Z | 1232382 | 172.21.0.5 | 1 | 1 | 2023-10-23 13:33 +2023-10-23T13:51:54.732123456Z | 725448 | 172.21.3.15 | 1 | 1 | 2023-10-23 13:51 +2023-10-23T13:52:55.015123456Z | 8268153 | 172.21.3.15 | 1 | 1 | 2023-10-23 13:52 +; + +inlineStatsDateFormatDateNanosWithNullHandling +required_capability: inline_stats + +FROM sample_data_ts_nanos +| KEEP @timestamp, event_duration, client_ip +| EVAL filtered_timestamp = CASE(event_duration > 5000000, @timestamp, null) +| INLINE STATS + count_all = COUNT(*), + count_filtered = COUNT(filtered_timestamp), + avg_duration = AVG(event_duration) +BY year = DATE_FORMAT("yyyy", COALESCE(filtered_timestamp, 
@timestamp)) +| EVAL avg_duration = ROUND(avg_duration) +| SORT @timestamp +| LIMIT 5 +; + +@timestamp:date_nanos | event_duration:long | client_ip:ip | filtered_timestamp:date_nanos | count_all:long | count_filtered:long | year:keyword | avg_duration:double +2023-10-23T12:15:03.360123456Z | 3450233 | 172.21.2.162 | null | 7 | 2 | 2023 | 3318761.0 +2023-10-23T12:27:28.948123456Z | 2764889 | 172.21.2.113 | null | 7 | 2 | 2023 | 3318761.0 +2023-10-23T13:33:34.937123456Z | 1232382 | 172.21.0.5 | null | 7 | 2 | 2023 | 3318761.0 +2023-10-23T13:51:54.732123456Z | 725448 | 172.21.3.15 | null | 7 | 2 | 2023 | 3318761.0 +2023-10-23T13:52:55.015123456Z | 8268153 | 172.21.3.15 | 2023-10-23T13:52:55.015123456Z | 7 | 2 | 2023 | 3318761.0 +; + +inlineStatsDateFormatOptimizationsWithConcat +required_capability: inline_stats + +FROM employees +| KEEP emp_no, hire_date +| EVAL format = CONCAT("yyyy", "-MM") +| INLINE STATS count = COUNT(*) BY year_month_hired = DATE_FORMAT(format, hire_date) +| SORT emp_no +| LIMIT 5 +; + +emp_no:integer | hire_date:datetime | format:keyword | count:long | year_month_hired:keyword +10001 | 1986-06-26T00:00:00.000Z | yyyy-MM | 1 | 1986-06 +10002 | 1985-11-21T00:00:00.000Z | yyyy-MM | 4 | 1985-11 +10003 | 1986-08-28T00:00:00.000Z | yyyy-MM | 2 | 1986-08 +10004 | 1986-12-01T00:00:00.000Z | yyyy-MM | 1 | 1986-12 +10005 | 1989-09-12T00:00:00.000Z | yyyy-MM | 4 | 1989-09 +; + + diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec index dc7607bda6934..30c5dea67e4c8 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec @@ -3436,6 +3436,545 @@ VALUES(color):keyword | color:keyword // end::mv-group-values-expand-result[] ; + +dateFormatYearOptimization +FROM employees +| STATS count = COUNT(*) BY year_hired = DATE_FORMAT("yyyy", hire_date) +| SORT year_hired +| LIMIT 5 +; + +count:long | year_hired:keyword +11 | 1985 +11 | 1986 +15 | 1987 +9 | 1988 +13 | 1989 +; + +dateFormatMonthOptimization +FROM employees +| STATS avg_salary = AVG(salary) BY month_hired = DATE_FORMAT("yyyy-MM", hire_date) +| EVAL avg_salary = ROUND(avg_salary) +| SORT month_hired +| LIMIT 5 +; + +month_hired:keyword | avg_salary:double +1985-02 | 46305.0 +1985-05 | 44817.0 +1985-07 | 62405.0 +1985-09 | 49095.0 +1985-10 | 51532.0 +; + +dateFormatDayOptimization +FROM employees +| STATS max_salary = MAX(salary) BY day_hired = DATE_FORMAT("yyyy-MM-dd", hire_date) +| SORT day_hired +| LIMIT 3 +; + +max_salary:integer | day_hired:keyword +66174 | 1985-02-18 +26436 | 1985-02-24 +44817 | 1985-05-13 +; + +dateFormatHourOptimization +FROM employees +| EVAL hire_datetime = hire_date +| STATS count = COUNT(*) BY hour_hired = DATE_FORMAT("yyyy-MM-dd HH", hire_datetime) +| SORT hour_hired +| LIMIT 3 +; + +count:long | hour_hired:keyword +1 | 1985-02-18 00 +1 | 1985-02-24 00 +1 | 1985-05-13 00 +; + +dateFormatMinuteOptimization +FROM employees +| EVAL hire_datetime = hire_date +| STATS count = COUNT(*) BY minute_hired = DATE_FORMAT("yyyy-MM-dd HH:mm", hire_datetime) +| SORT minute_hired +| LIMIT 3 +; + +count:long | minute_hired:keyword +1 | 1985-02-18 00:00 +1 | 1985-02-24 00:00 +1 | 1985-05-13 00:00 +; + +dateFormatSecondOptimization +FROM employees +| EVAL hire_datetime = hire_date +| STATS count = COUNT(*) BY second_hired = DATE_FORMAT("yyyy-MM-dd HH:mm:ss", hire_datetime) +| SORT second_hired +| LIMIT 3 +; + +count:long | 
second_hired:keyword +1 | 1985-02-18 00:00:00 +1 | 1985-02-24 00:00:00 +1 | 1985-05-13 00:00:00 +; + +dateFormatComplexOptimization +FROM employees +| STATS count = COUNT(*), avg_salary = AVG(salary) BY + year_month = DATE_FORMAT("yyyy-MM", hire_date), + day_of_year = DATE_FORMAT("yyyy-D", hire_date) +| EVAL avg_salary = ROUND(avg_salary) +| SORT year_month, day_of_year +| LIMIT 5 +; + +count:long | year_month:keyword | day_of_year:keyword | avg_salary:double +1 | 1985-02 | 1985-49 | 66174.0 +1 | 1985-02 | 1985-55 | 26436.0 +1 | 1985-05 | 1985-133 | 44817.0 +1 | 1985-07 | 1985-190 | 62405.0 +1 | 1985-09 | 1985-260 | 49095.0 +; + +dateFormatWithLiterals +FROM employees +| STATS count = COUNT(*) BY formatted_date = DATE_FORMAT("'Year:'yyyy'-Month:'MM", hire_date) +| SORT formatted_date +| LIMIT 5 +; + +count:long | formatted_date:keyword +2 | Year:1985-Month:02 +1 | Year:1985-Month:05 +1 | Year:1985-Month:07 +1 | Year:1985-Month:09 +2 | Year:1985-Month:10 +; + +dateFormatNonOptimizable +FROM employees +| STATS count = COUNT(*) BY quarter = DATE_FORMAT("yyyy-Q-d", hire_date) +| SORT quarter +| LIMIT 4 +; + +count:long | quarter:keyword +1 | 1985-1-18 +1 | 1985-1-24 +1 | 1985-2-13 +1 | 1985-3-17 +; + +dateFormatMixedOptimization +FROM employees +| STATS + count = COUNT(*), + avg_salary = AVG(salary) +BY + year_optimizable = DATE_FORMAT("yyyy", hire_date), + quarter_not_optimizable = DATE_FORMAT("yyyy-Q-d", hire_date), + month_optimizable = DATE_FORMAT("MM", hire_date) +| EVAL avg_salary = ROUND(avg_salary) +| SORT year_optimizable, quarter_not_optimizable, month_optimizable +| LIMIT 5 +; + +count:long | year_optimizable:keyword | quarter_not_optimizable:keyword | month_optimizable:keyword | avg_salary:double +1 | 1985 | 1985-1-18 | 02 | 66174.0 +1 | 1985 | 1985-1-24 | 02 | 26436.0 +1 | 1985 | 1985-2-13 | 05 | 44817.0 +1 | 1985 | 1985-3-17 | 09 | 49095.0 +1 | 1985 | 1985-3-9 | 07 | 62405.0 +; + +dateFormatDifferentSeparators +FROM employees +| STATS count = COUNT(*) BY + slash_format = DATE_FORMAT("yyyy/MM/dd", hire_date), + dot_format = DATE_FORMAT("yyyy.MM.dd", hire_date) +| SORT slash_format +| LIMIT 3 +; + +count:long | slash_format:keyword | dot_format:keyword +1 | 1985/02/18 | 1985.02.18 +1 | 1985/02/24 | 1985.02.24 +1 | 1985/05/13 | 1985.05.13 +; + +dateFormatTextualMonths +FROM employees +| STATS count = COUNT(*) BY month_name = DATE_FORMAT("yyyy-MMM", hire_date) +| SORT month_name +| LIMIT 5 +; + +count:long | month_name:keyword +2 | 1985-Feb +1 | 1985-Jul +1 | 1985-May +4 | 1985-Nov +2 | 1985-Oct +; + +dateFormatISOFormat +FROM employees +| STATS count = COUNT(*) BY iso_date = DATE_FORMAT("yyyy-MM-dd'T'HH:mm:ss", hire_date) +| SORT iso_date +| LIMIT 3 +; + +count:long | iso_date:keyword +1 | 1985-02-18T00:00:00 +1 | 1985-02-24T00:00:00 +1 | 1985-05-13T00:00:00 +; + +dateFormatWithDateNanos +FROM sample_data_ts_nanos +| STATS count = COUNT(*) BY year_month = DATE_FORMAT("yyyy-MM", @timestamp) +| SORT year_month +; + +count:long | year_month:keyword +7 | 2023-10 +; + +dateFormatDateNanosYear +FROM sample_data_ts_nanos +| STATS + count = COUNT(*), + min_timestamp = MIN(@timestamp), + max_timestamp = MAX(@timestamp) +BY year = DATE_FORMAT("yyyy", @timestamp) +| SORT year +; + +count:long | min_timestamp:date_nanos | max_timestamp:date_nanos | year:keyword +7 | 2023-10-23T12:15:03.360123456Z | 2023-10-23T13:55:01.543123456Z | 2023 +; + +dateFormatDateNanosMonth +FROM sample_data_ts_nanos +| STATS + count = COUNT(*), + avg_duration = AVG(event_duration) +BY month = DATE_FORMAT("MM", 
@timestamp) +| EVAL avg_duration = ROUND(avg_duration) +| SORT month +; + +count:long | month:keyword | avg_duration:double +7 | 10 | 3318761.0 +; + +dateFormatDateNanosDay +FROM sample_data_ts_nanos +| STATS count = COUNT(*) BY day = DATE_FORMAT("dd", @timestamp) +| SORT day +; + +count:long | day:keyword +7 | 23 +; + +dateFormatDateNanosHour +FROM sample_data_ts_nanos +| STATS count = COUNT(*) BY hour = DATE_FORMAT("HH", @timestamp) +| SORT hour +; + +count:long | hour:keyword +2 | 12 +5 | 13 +; + +dateFormatDateNanosYearMonth +FROM sample_data_ts_nanos +| STATS + count = COUNT(*), + clients = COUNT(client_ip) +BY year_month = DATE_FORMAT("yyyy-MM", @timestamp) +| SORT year_month +; + +count:long | clients:long | year_month:keyword +7 | 7 | 2023-10 +; + +dateFormatDateNanosMultipleFormats +FROM sample_data_ts_nanos +| STATS + count = COUNT(*), + total_duration = SUM(event_duration) +BY + year = DATE_FORMAT("yyyy", @timestamp), + month = DATE_FORMAT("MM", @timestamp), + day = DATE_FORMAT("dd", @timestamp) +| SORT year, month, day +; + +count:long | total_duration:long | year:keyword | month:keyword | day:keyword +7 | 23231327 | 2023 | 10 | 23 +; + +dateFormatDateNanosWithWhere +FROM sample_data_ts_nanos +| WHERE @timestamp >= "2023-10-23T13:00:00Z" +| STATS count = COUNT(*) BY year = DATE_FORMAT("yyyy", @timestamp) +| SORT year +; + +count:long | year:keyword +5 | 2023 +; + +dateFormatDateNanosComplexPattern +FROM sample_data_ts_nanos +| STATS count = COUNT(*) BY formatted = DATE_FORMAT("yyyy-MM-dd HH:mm", @timestamp) +| SORT formatted +| LIMIT 5 +; + +count:long | formatted:keyword +1 | 2023-10-23 12:15 +1 | 2023-10-23 12:27 +1 | 2023-10-23 13:33 +1 | 2023-10-23 13:51 +1 | 2023-10-23 13:52 +; + +dateFormatDateNanosWithNullHandling +FROM sample_data_ts_nanos +| EVAL modified_timestamp = CASE(event_duration > 5000000, @timestamp, null) +| STATS + count_all = COUNT(*), + count_non_null = COUNT(modified_timestamp) +BY month = DATE_FORMAT("yyyy-MM", COALESCE(modified_timestamp, @timestamp)) +| SORT month +; + +count_all:long | count_non_null:long | month:keyword +7 | 2 | 2023-10 +; + + +dateFormatYearOptimization +FROM employees +| STATS count = COUNT(*) BY year_hired = DATE_FORMAT("yyyy", hire_date) +| SORT year_hired +| LIMIT 5 +; + +count:long | year_hired:keyword +11 | 1985 +11 | 1986 +15 | 1987 +9 | 1988 +13 | 1989 +; + +dateFormatMonthOptimization +FROM employees +| STATS avg_salary = AVG(salary) BY month_hired = DATE_FORMAT("yyyy-MM", hire_date) +| EVAL avg_salary = ROUND(avg_salary) +| SORT month_hired +| LIMIT 5 +; + +month_hired:keyword | avg_salary:double +1985-02 | 46305.0 +1985-05 | 44817.0 +1985-07 | 62405.0 +1985-09 | 49095.0 +1985-10 | 51532.0 +; + +dateFormatDayOptimization +FROM employees +| STATS max_salary = MAX(salary) BY day_hired = DATE_FORMAT("yyyy-MM-dd", hire_date) +| SORT day_hired +| LIMIT 3 +; + +max_salary:integer | day_hired:keyword +66174 | 1985-02-18 +26436 | 1985-02-24 +44817 | 1985-05-13 +; + +dateFormatHourOptimization +FROM employees +| EVAL hire_datetime = hire_date +| STATS count = COUNT(*) BY hour_hired = DATE_FORMAT("yyyy-MM-dd HH", hire_datetime) +| SORT hour_hired +| LIMIT 3 +; + +count:long | hour_hired:keyword +1 | 1985-02-18 00 +1 | 1985-02-24 00 +1 | 1985-05-13 00 +; + +dateFormatMinuteOptimization +FROM employees +| EVAL hire_datetime = hire_date +| STATS count = COUNT(*) BY minute_hired = DATE_FORMAT("yyyy-MM-dd HH:mm", hire_datetime) +| SORT minute_hired +| LIMIT 3 +; + +count:long | minute_hired:keyword +1 | 1985-02-18 00:00 +1 | 1985-02-24 
00:00 +1 | 1985-05-13 00:00 +; + +dateFormatSecondOptimization +FROM employees +| EVAL hire_datetime = hire_date +| STATS count = COUNT(*) BY second_hired = DATE_FORMAT("yyyy-MM-dd HH:mm:ss", hire_datetime) +| SORT second_hired +| LIMIT 3 +; + +count:long | second_hired:keyword +1 | 1985-02-18 00:00:00 +1 | 1985-02-24 00:00:00 +1 | 1985-05-13 00:00:00 +; + +dateFormatComplexOptimization +FROM employees +| STATS count = COUNT(*), avg_salary = AVG(salary) BY + year_month = DATE_FORMAT("yyyy-MM", hire_date), + day_of_year = DATE_FORMAT("yyyy-D", hire_date) +| EVAL avg_salary = ROUND(avg_salary) +| SORT year_month, day_of_year +| LIMIT 5 +; + +count:long | year_month:keyword | day_of_year:keyword | avg_salary:double +1 | 1985-02 | 1985-49 | 66174.0 +1 | 1985-02 | 1985-55 | 26436.0 +1 | 1985-05 | 1985-133 | 44817.0 +1 | 1985-07 | 1985-190 | 62405.0 +1 | 1985-09 | 1985-260 | 49095.0 +; + +dateFormatWithLiterals +FROM employees +| STATS count = COUNT(*) BY formatted_date = DATE_FORMAT("'Year:'yyyy'-Month:'MM", hire_date) +| SORT formatted_date +| LIMIT 5 +; + +count:long | formatted_date:keyword +2 | Year:1985-Month:02 +1 | Year:1985-Month:05 +1 | Year:1985-Month:07 +1 | Year:1985-Month:09 +2 | Year:1985-Month:10 +; + +dateFormatNonOptimizable +FROM employees +| STATS count = COUNT(*) BY quarter = DATE_FORMAT("yyyy-Q-d", hire_date) +| SORT quarter +| LIMIT 4 +; + +count:long | quarter:keyword +1 | 1985-1-18 +1 | 1985-1-24 +1 | 1985-2-13 +1 | 1985-3-17 +; + +dateFormatMixedOptimization +FROM employees +| STATS + count = COUNT(*), + avg_salary = AVG(salary) +BY + year_optimizable = DATE_FORMAT("yyyy", hire_date), + quarter_not_optimizable = DATE_FORMAT("yyyy-Q-d", hire_date), + month_optimizable = DATE_FORMAT("MM", hire_date) +| EVAL avg_salary = ROUND(avg_salary) +| SORT year_optimizable, quarter_not_optimizable, month_optimizable +| LIMIT 5 +; + +count:long | year_optimizable:keyword | quarter_not_optimizable:keyword | month_optimizable:keyword | avg_salary:double +1 | 1985 | 1985-1-18 | 02 | 66174.0 +1 | 1985 | 1985-1-24 | 02 | 26436.0 +1 | 1985 | 1985-2-13 | 05 | 44817.0 +1 | 1985 | 1985-3-17 | 09 | 49095.0 +1 | 1985 | 1985-3-9 | 07 | 62405.0 +; + +dateFormatDifferentSeparators +FROM employees +| STATS count = COUNT(*) BY + slash_format = DATE_FORMAT("yyyy/MM/dd", hire_date), + dot_format = DATE_FORMAT("yyyy.MM.dd", hire_date) +| SORT slash_format +| LIMIT 3 +; + +count:long | slash_format:keyword | dot_format:keyword +1 | 1985/02/18 | 1985.02.18 +1 | 1985/02/24 | 1985.02.24 +1 | 1985/05/13 | 1985.05.13 +; + +dateFormatTextualMonths +FROM employees +| STATS count = COUNT(*) BY month_name = DATE_FORMAT("yyyy-MMM", hire_date) +| SORT month_name +| LIMIT 5 +; + +count:long | month_name:keyword +2 | 1985-Feb +1 | 1985-Jul +1 | 1985-May +4 | 1985-Nov +2 | 1985-Oct +; + +dateFormatISOFormat +FROM employees +| STATS count = COUNT(*) BY iso_date = DATE_FORMAT("yyyy-MM-dd'T'HH:mm:ss", hire_date) +| SORT iso_date +| LIMIT 3 +; + +count:long | iso_date:keyword +1 | 1985-02-18T00:00:00 +1 | 1985-02-24T00:00:00 +1 | 1985-05-13T00:00:00 +; + + +dateFormatWithConcat +FROM sample_data_ts_nanos +| EVAL format = CONCAT("yyyy", "-MM") +| STATS count = COUNT(*) BY year_month = DATE_FORMAT(format, @timestamp) +| SORT year_month +; + +count:long | year_month:keyword +7 | 2023-10 +; + + filterOrdinalValues required_capability: fix_filter_ordinals from employees diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/date/DateFormat.java 
b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/date/DateFormat.java index d30e99794a44e..5b039e08e2c28 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/date/DateFormat.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/date/DateFormat.java @@ -94,11 +94,11 @@ public String getWriteableName() { return ENTRY.name; } - Expression field() { + public Expression field() { return field; } - Expression format() { + public Expression format() { return format; } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PropagateInlineEvals.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PropagateInlineEvals.java index f518322c4f924..4c8c03a7c7e82 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PropagateInlineEvals.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/PropagateInlineEvals.java @@ -8,16 +8,21 @@ package org.elasticsearch.xpack.esql.optimizer.rules.logical; import org.elasticsearch.xpack.esql.core.expression.Alias; +import org.elasticsearch.xpack.esql.core.expression.Attribute; +import org.elasticsearch.xpack.esql.core.expression.AttributeSet; import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.expression.NamedExpression; import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute; +import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateFormat; import org.elasticsearch.xpack.esql.plan.logical.Aggregate; import org.elasticsearch.xpack.esql.plan.logical.Eval; import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan; +import org.elasticsearch.xpack.esql.plan.logical.Project; import org.elasticsearch.xpack.esql.plan.logical.join.InlineJoin; import org.elasticsearch.xpack.esql.plan.logical.join.StubRelation; import java.util.ArrayList; -import java.util.LinkedHashMap; +import java.util.HashMap; import java.util.List; import java.util.Map; @@ -42,8 +47,7 @@ protected LogicalPlan rule(InlineJoin plan) { // grouping references List groupingAlias = new ArrayList<>(); - // TODO: replace this with AttributeSet - Map groupingRefs = new LinkedHashMap<>(); + AttributeSet.Builder groupingRefs = AttributeSet.builder(); // perform only one iteration that does two things // first checks any aggregate that declares expressions inside the grouping @@ -53,7 +57,7 @@ protected LogicalPlan rule(InlineJoin plan) { // collect references for (Expression g : aggregate.groupings()) { if (g instanceof ReferenceAttribute ref) { - groupingRefs.put(ref.name(), ref); + groupingRefs.add(ref); } } } @@ -68,7 +72,7 @@ protected LogicalPlan rule(InlineJoin plan) { List remainingEvals = new ArrayList<>(fields.size()); for (Alias f : fields) { // TODO: look into identifying refs by their NameIds instead - if (groupingRefs.remove(f.name()) != null) { + if (groupingRefs.remove(f.toAttribute())) { groupingAlias.add(f); } else { remainingEvals.add(f); @@ -86,6 +90,92 @@ protected LogicalPlan rule(InlineJoin plan) { if (groupingAlias.size() > 0) { left = new Eval(plan.source(), plan.left(), groupingAlias); } + + /* + * Handle DATE_FORMAT optimization pattern matching: + * + * BEFORE: The ReplaceAggregateNestedExpressionWithEval rule may have transformed a plan like: + * InlineJoin[LEFT,[month{r$}#11],[month{r$}#11]] + * 
|_EsRelation[test_index][nanos{f}#8, timestamp{f}#10, value{f}#9] + * \_Project[[my_sum{r}#3, month{r}#6]] + * \_Eval[[DATEFORMAT(yyyy-MM[KEYWORD],month{r$}#11) AS month#6]] + * \_Aggregate[[month{r$}#11],[SUM(value{f}#9,true[BOOLEAN],compensated[KEYWORD]) AS my_sum#3, month{r$}#11]] + * \_Eval[[DATETRUNC(P1M[DATE_PERIOD],timestamp{f}#10) AS month#11]] + * \_StubRelation[[nanos{f}#8, timestamp{f}#10, value{f}#9]] + * + * AFTER: We want to optimize this to: + * Project[[nanos{f}#8, timestamp{f}#10, value{f}#9, my_sum{r}#3, month{r}#6]] + * \_Eval[[DATEFORMAT(yyyy-MM[KEYWORD],month{r$}#11) AS month#6]] + * \_InlineJoin[LEFT,[month{r$}#11],[month{r$}#11]] + * |_Eval[[DATETRUNC(P1M[DATE_PERIOD],timestamp{f}#10) AS month#11]] + * | \_EsRelation[test_index][nanos{f}#8, timestamp{f}#10, value{f}#9] + * \_Aggregate[[month{r$}#11],[SUM(value{f}#9,true[BOOLEAN],compensated[KEYWORD]) AS my_sum#3, month{r$}#11]] + * \_StubRelation[[nanos{f}#8, timestamp{f}#10, value{f}#9, month{r$}#11]] + * + * Due to the optimization in ReplaceAggregateNestedExpressionWithEval for GROUP BY DATE_FORMAT, the fields on the right-side of + * the join may change. Therefore, we need to move the newly generated Project and Eval nodes outside the join, place the Aggregate + * node on the left-side of the join, and finally update the join fields to use those produced by the Aggregate, ensuring that both + * sides join on the correct fields. + */ + List leftFields = plan.config().leftFields(); + List rightFields = plan.config().rightFields(); + + // Find attributes that were shadowed by those generated by ReplaceAggregateNestedExpressionWithEval + AttributeSet.Builder shadowing = AttributeSet.builder(); + for (Alias a : groupingAlias) { + if (rightFields.contains(a.toAttribute()) == false) { + shadowing.add(a.toAttribute()); + } + } + + // Check if we have the expected pattern: Project -> Eval -> Aggregate + if (shadowing.isEmpty() == false && right instanceof Project project) { + AttributeSet.Builder builder = AttributeSet.builder(); + builder.addAll(project.output()); + AttributeSet projectOutput = builder.build(); + + // Verify the Project contains an Eval and covers all left fields needed for the join + if (project.child() instanceof Eval eval && projectOutput.containsAll(leftFields)) { + Map replacements = new HashMap<>(); + + // Look for DATE_FORMAT expressions that can be moved outside the join + for (Alias f : eval.fields()) { + if (f.child() instanceof DateFormat df && shadowing.remove(df.field())) { + Attribute original = ((NamedExpression) df.field()).toAttribute(); // The original datetime field + Attribute aliasAttr = f.toAttribute(); // The DATE_FORMAT alias + replacements.put(original, aliasAttr); + + // Update join configuration to use original field instead of alias + int rIndex = rightFields.indexOf(aliasAttr); + if (rIndex >= 0) rightFields.set(rIndex, original); + + int lIndex = leftFields.indexOf(aliasAttr); + if (lIndex >= 0) leftFields.set(lIndex, original); + } + } + + // If we successfully handled all shadowing attributes, restructure the plan + if (shadowing.isEmpty()) { + // Remove the Eval from the right side (it will be moved outside) + right = eval.child(); + + // Update the final output to use the correct attribute references + List output = plan.output(); + output.replaceAll(attr -> replacements.getOrDefault(attr, attr)); + + // Create the new join plan with original fields + LogicalPlan join = plan.replaceChildren( + left, + InlineJoin.replaceStub(new StubRelation(right.source(), 
left.output()), right) + ); + + // Wrap the join with the DATE_FORMAT evaluation + LogicalPlan evalPlan = eval.replaceChild(join); + return new Project(evalPlan.source(), evalPlan, output); + } + } + } + // replace the old stub with the new out to capture the new output return plan.replaceChildren(left, replaceStub(new StubRelation(right.source(), computeOutput(right, left)), right)); } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java index 830220d3f8dbd..0556b3b40684d 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java @@ -7,25 +7,48 @@ package org.elasticsearch.xpack.esql.optimizer.rules.logical; +import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.xpack.esql.core.expression.Alias; import org.elasticsearch.xpack.esql.core.expression.Attribute; +import org.elasticsearch.xpack.esql.core.expression.AttributeMap; import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.expression.Literal; import org.elasticsearch.xpack.esql.core.expression.MapExpression; import org.elasticsearch.xpack.esql.core.expression.NamedExpression; +import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute; +import org.elasticsearch.xpack.esql.core.tree.Source; +import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.core.util.Holder; import org.elasticsearch.xpack.esql.expression.function.aggregate.AggregateFunction; import org.elasticsearch.xpack.esql.expression.function.grouping.GroupingFunction; +import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateFormat; +import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateTrunc; +import org.elasticsearch.xpack.esql.optimizer.LogicalOptimizerContext; +import org.elasticsearch.xpack.esql.optimizer.rules.RuleUtils; import org.elasticsearch.xpack.esql.plan.logical.Aggregate; import org.elasticsearch.xpack.esql.plan.logical.Eval; import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan; +import org.elasticsearch.xpack.esql.plan.logical.Project; +import org.elasticsearch.xpack.esql.type.EsqlDataTypeConverter; +import java.time.Period; +import java.time.format.DateTimeFormatter; +import java.time.temporal.ChronoField; +import java.time.temporal.ChronoUnit; +import java.time.temporal.IsoFields; +import java.time.temporal.JulianFields; import java.util.ArrayList; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; /** - * Replace nested expressions inside a {@link Aggregate} with synthetic eval. + * An optimizer rule that performs two main optimizations: + * 1. Replaces nested expressions inside a {@link Aggregate} with synthetic eval + * 2. Optimizes DATE_FORMAT function calls in GROUP BY clauses with more efficient DATE_TRUNC operations + *

+ * For nested expressions in aggregates: * {@code STATS SUM(a + 1) BY x % 2} * becomes * {@code EVAL `a + 1` = a + 1, `x % 2` = x % 2 | STATS SUM(`a+1`_ref) BY `x % 2`_ref} @@ -33,15 +56,38 @@ * {@code INLINE STATS SUM(a + 1) BY x % 2} * becomes * {@code EVAL `a + 1` = a + 1, `x % 2` = x % 2 | INLINE STATS SUM(`a+1`_ref) BY `x % 2`_ref} + *

+ * For date formatting optimization: + * {@code STATS sum = SUM(value) BY month = DATE_FORMAT("yyyy-MM", timestamp) } + * can be optimized to + * {@code STATS sum = SUM(value) BY month1 = DATE_TRUNC(1 month, timestamp) | EVAL month = DATE_FORMAT("yyyy-MM", month1) | KEEP sum, month} + * which is more efficient for grouping operations. + *

+ * The date formatting optimization analyzes the format pattern and maps it to the smallest possible time interval + * that preserves the grouping semantics. Supported intervals range from milliseconds to years, including the special + * case of quarters; week-based and nanosecond-precision patterns are not optimized. + *

+ * This date optimization not only improves performance but also ensures correctness in time-based grouping: + * DATE_TRUNC properly handles timezone and daylight saving time (DST) transitions when using Period or Duration + * intervals, while DATE_FORMAT does not account for these timezone-related considerations. */ -public final class ReplaceAggregateNestedExpressionWithEval extends OptimizerRules.OptimizerRule { +public final class ReplaceAggregateNestedExpressionWithEval extends OptimizerRules.ParameterizedOptimizerRule< + Aggregate, + LogicalOptimizerContext> { + + public ReplaceAggregateNestedExpressionWithEval() { + super(OptimizerRules.TransformDirection.DOWN); + } @Override - protected LogicalPlan rule(Aggregate aggregate) { - List evals = new ArrayList<>(); + protected LogicalPlan rule(Aggregate aggregate, LogicalOptimizerContext ctx) { + List evalsBeforeAgg = new ArrayList<>(); + List evalsAfterAgg = new ArrayList<>(); Map evalNames = new HashMap<>(); Map groupingAttributes = new HashMap<>(); List newGroupings = new ArrayList<>(aggregate.groupings()); + List newProjections = new ArrayList<>(); + Map referenceAttributes = new HashMap<>(); boolean groupingChanged = false; // start with the groupings since the aggs might reuse/reference them @@ -52,7 +98,7 @@ protected LogicalPlan rule(Aggregate aggregate) { // for non-evaluable grouping functions, replace their nested expressions with attributes and extract the expression out // into an eval (added later below) if (asChild instanceof GroupingFunction.NonEvaluatableGroupingFunction gf) { - Expression newGroupingFunction = transformNonEvaluatableGroupingFunction(gf, evals); + Expression newGroupingFunction = transformNonEvaluatableGroupingFunction(gf, evalsBeforeAgg); if (newGroupingFunction != gf) { groupingChanged = true; newGroupings.set(i, as.replaceChild(newGroupingFunction)); @@ -61,7 +107,41 @@ protected LogicalPlan rule(Aggregate aggregate) { // Move the alias into an eval and replace it with its attribute. 
groupingChanged = true; var attr = as.toAttribute(); - evals.add(as); + final Attribute finalAttribute = as.toAttribute(); + if (asChild instanceof DateFormat df + && aggregate.aggregates() + .stream() + .anyMatch( + expression -> expression instanceof Alias alias && alias.references().contains(finalAttribute) + ) == false) { + // Extract the format pattern and field from DateFormat + Expression rawFormat = df.format(); + AttributeMap collectRefs = RuleUtils.foldableReferences(aggregate, ctx); + + // Try to convert the format pattern to a minimal time interval + // This optimization attempts to simplify date formatting to DATE_TRUNC operations + if (collectRefs.resolve(rawFormat, rawFormat) instanceof Literal format) { + Literal interval = inferTruncIntervalFromFormat(BytesRefs.toString(format.value()), g.source()); + // If we can optimize the format to use DATE_TRUNC + if (interval != null) { + // Create a new DateTrunc operation with the optimized interval + DateTrunc dateTrunc = new DateTrunc(df.source(), interval, df.field()); + // Create a synthetic alias for the DateTrunc operation + var alias = new Alias(as.source(), as.name(), dateTrunc, null, true); + attr = alias.toAttribute(); + // Replace the original DateFormat children with the new format and attribute + Expression expression = df.replaceChildren(List.of(format, attr)); + // Create a new eval alias for the optimized expression + Alias newEval = as.replaceChild(expression); + evalsAfterAgg.add(newEval); + referenceAttributes.put(attr, newEval.toAttribute()); + evalNames.put(as.name(), attr); + as = alias; + } + } + } + + evalsBeforeAgg.add(as); evalNames.put(as.name(), attr); newGroupings.set(i, attr); if (asChild instanceof GroupingFunction.EvaluatableGroupingFunction gf) { @@ -77,7 +157,7 @@ protected LogicalPlan rule(Aggregate aggregate) { // map to track common expressions Map expToAttribute = new HashMap<>(); - for (Alias a : evals) { + for (Alias a : evalsBeforeAgg) { expToAttribute.put(a.child().canonical(), a.toAttribute()); } @@ -102,7 +182,7 @@ protected LogicalPlan rule(Aggregate aggregate) { // look for the aggregate function var replaced = child.transformUp( AggregateFunction.class, - af -> transformAggregateFunction(af, expToAttribute, evals, counter, aggsChanged) + af -> transformAggregateFunction(af, expToAttribute, evalsBeforeAgg, counter, aggsChanged) ); // replace any evaluatable grouping functions with their references pointing to the added synthetic eval replaced = replaced.transformDown(GroupingFunction.EvaluatableGroupingFunction.class, gf -> { @@ -114,17 +194,39 @@ protected LogicalPlan rule(Aggregate aggregate) { return as.replaceChild(replaced); }); + if (groupingChanged) { + Attribute ref = null; + if (agg instanceof ReferenceAttribute ra) { + // stats + ref = evalNames.get(ra.name()); + } else if (agg instanceof Alias alias) { + // inline stats + ref = evalNames.get(alias.toAttribute().name()); + } + + if (ref != null) { + aggsChanged.set(true); + newAggs.add(ref); + newProjections.add(referenceAttributes.getOrDefault(ref, ref.toAttribute())); + continue; + } + } newAggs.add(a); + newProjections.add(a.toAttribute()); } - if (evals.size() > 0) { + if (evalsBeforeAgg.size() > 0) { var groupings = groupingChanged ? newGroupings : aggregate.groupings(); var aggregates = aggsChanged.get() ? 
newAggs : aggregate.aggregates(); - var newEval = new Eval(aggregate.source(), aggregate.child(), evals); + var newEval = new Eval(aggregate.source(), aggregate.child(), evalsBeforeAgg); aggregate = aggregate.with(newEval, groupings, aggregates); } + if (evalsAfterAgg.size() > 0) { + Eval eval = new Eval(aggregate.source(), aggregate, evalsAfterAgg); + return new Project(aggregate.source(), eval, newProjections); + } return aggregate; } @@ -193,4 +295,175 @@ private static Expression transformAggregateFunction( private static String syntheticName(Expression expression, Expression func, int counter) { return TemporaryNameUtils.temporaryName(expression, func, counter); } + + /** + * Attempts to infer the minimal time interval that corresponds to a given date format. + *

+ * The idea is to map {@code DATE_FORMAT} patterns to the smallest truncation unit + * (year, month, day, hour, minute, second, millisecond, quarter) so that we can optimize + * {@code DATE_FORMAT} by rewriting it as {@code DATE_TRUNC} when possible. + *
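+ * For example, a few illustrative mappings (see {@code DateFormatToTruncIntervalTests} for the full matrix):
+ * <pre>
+ *   "yyyy"                -> 1 year   (P1Y)
+ *   "yyyy-MM"             -> 1 month  (P1M)
+ *   "yyyy-Q"              -> 3 months (P3M)
+ *   "yyyy-MM-dd HH:mm:ss" -> 1 second (PT1S)
+ *   "yyyy-w"              -> null (week-based patterns cannot be rewritten)
+ * </pre>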

+ * Limitations:
+ * <ul>
+ *   <li>The format must represent a continuous hierarchy of time units starting from "year".
+ *       For example: {@code yyyy-MM-dd HH:mm:ss} is valid, but skipping "month" while using "day"
+ *       is not.</li>
+ *   <li>Quarter optimization is a special case: it's only applied when the format contains
+ *       only year and quarter fields (e.g., "yyyy-Q").</li>
+ *   <li>Patterns involving unsupported fields (e.g., ERA, DAY_OF_WEEK, AM/PM, nanoseconds)
+ *       cannot be mapped to {@code DATE_TRUNC} and will return {@code null}.</li>
+ *   <li>Nanosecond-level precision is not supported by {@code DATE_TRUNC}.</li>
+ * </ul>
+ * + * @param format The date format pattern (e.g., "yyyy-MM-dd HH:mm:ss"). + * @param source The source of the query. + * @return The corresponding minimal interval as a {@link Literal}, or {@code null} if the format + * cannot be represented as a truncation unit. + * @see EsqlDataTypeConverter.INTERVALS for supported truncation units. + */ + public static Literal inferTruncIntervalFromFormat(String format, Source source) { + try { + DateTimeFormatter formatter = DateTimeFormatter.ofPattern(format, Locale.ROOT); + String formatterAsString = formatter.toString(); + // Not supported to be converted to interval + if (formatterAsString.contains("Text(" + ChronoField.ERA) // G + || formatterAsString.contains("Value(" + JulianFields.MODIFIED_JULIAN_DAY) // g + || formatterAsString.contains("Value(" + ChronoField.ALIGNED_WEEK_OF_MONTH) // f + || formatterAsString.contains("Text(" + ChronoField.DAY_OF_WEEK) // E + || formatterAsString.contains("Localized(" + ChronoField.DAY_OF_WEEK) // c/e + || formatterAsString.contains("Text(" + ChronoField.AMPM_OF_DAY) // a + || formatterAsString.contains("Value(" + ChronoField.HOUR_OF_AMPM) // K + || formatterAsString.contains("Value(" + ChronoField.CLOCK_HOUR_OF_AMPM) // h + // nanosecond interval not supported in DATE_TRUNC + || formatterAsString.contains("Fraction(" + ChronoField.NANO_OF_SECOND) // S + || formatterAsString.contains("Value(" + ChronoField.NANO_OF_SECOND) // n + || formatterAsString.contains("Value(" + ChronoField.NANO_OF_DAY) // N + // others + || formatterAsString.contains("ZoneText(FULL)") // zzzz/vvvv + || formatterAsString.contains("ZoneText(SHORT)") // z/v + || formatterAsString.contains("ZoneId()") // VV + || formatterAsString.contains("Offset(+HHMM,'+0000')") // Z/xx + || formatterAsString.contains("LocalizedOffset(FULL)") // ZZZZ/OOOO + || formatterAsString.contains("Offset(+HH:MM:ss,'Z')") // ZZZZZ/XXXXX + || formatterAsString.contains("LocalizedOffset(SHORT)") // O + || formatterAsString.contains("Offset(+HHmm,'Z')") // X + || formatterAsString.contains("Offset(+HHMM,'Z')") // XX + || formatterAsString.contains("Offset(+HH:MM,'Z')") // XXX + || formatterAsString.contains("Offset(+HHMMss,'Z')") // XXXX + || formatterAsString.contains("Offset(+HHmm,'+00')") // x + || formatterAsString.contains("Offset(+HH:MM,'+00:00')") // xxx + || formatterAsString.contains("Offset(+HHMMss,'+0000')") // xxxx + || formatterAsString.contains("Offset(+HH:MM:ss,'+00:00')") // xxxxx + || formatterAsString.contains("Localized(WeekOfMonth,1)") // W + || formatterAsString.contains("Localized(WeekOfWeekBasedYear,1)") // w + || formatterAsString.contains("Localized(WeekOfWeekBasedYear,2)") // ww + || formatterAsString.contains("DayPeriod(SHORT)") // B + || formatterAsString.contains("DayPeriod(FULL)") // BBBB + || formatterAsString.contains("DayPeriod(NARROW)")) {// BBBBB + return null; + } + + // Define the hierarchy of time units, starting from year and gradually decreasing. 
+ // 0: year, 1: month, 2: day, 3: hour, 4: minute, 5: second, 6: millisecond + boolean[] levels = new boolean[7]; + boolean hasQuarter = false; + // year + // y/u/Y + if (formatterAsString.contains("Value(" + ChronoField.YEAR_OF_ERA) + || formatterAsString.contains("Value(" + ChronoField.YEAR) + || formatterAsString.contains("Localized(WeekBasedYear")) { + levels[0] = true; + } + + // quarter + // Q/q + if (formatterAsString.contains("Value(" + IsoFields.QUARTER_OF_YEAR) + || formatterAsString.contains("Text(" + IsoFields.QUARTER_OF_YEAR)) { + hasQuarter = true; + } + + // month + // M/L + if (formatterAsString.contains("Value(" + ChronoField.MONTH_OF_YEAR) + || formatterAsString.contains("Text(" + ChronoField.MONTH_OF_YEAR)) { + levels[1] = true; + } + + // day + // d + if (formatterAsString.contains("Value(" + ChronoField.DAY_OF_MONTH)) { + levels[2] = true; + } + // D + if (formatterAsString.contains("Value(" + ChronoField.DAY_OF_YEAR)) { + levels[1] = true; + levels[2] = true; + } + + // hour + // H/k + if (formatterAsString.contains("Value(" + ChronoField.HOUR_OF_DAY) + || formatterAsString.contains("Value(" + ChronoField.CLOCK_HOUR_OF_DAY)) { + levels[3] = true; + } + + // minute + // m + if (formatterAsString.contains("Value(" + ChronoField.MINUTE_OF_HOUR)) { + levels[4] = true; + } + + // second + // s + if (formatterAsString.contains("Value(" + ChronoField.SECOND_OF_MINUTE)) { + levels[5] = true; + } + + // millisecond + // A + if (formatterAsString.contains("Value(" + ChronoField.MILLI_OF_DAY)) { + levels[3] = true; + levels[4] = true; + levels[5] = true; + levels[6] = true; + } + + // Check for continuity + int lastLevel = -1; + for (int i = 0; i < levels.length; i++) { + if (levels[i]) { + if (lastLevel == i - 1) { + lastLevel = i; + } else { + return null; // Not continuous, return null + } + } + } + + // Special case: when format contains only year and quarter fields (e.g., "yyyy-Q"), + // return a 3-month period to represent quarterly truncation at the year level + if (lastLevel == 0 && hasQuarter) { + return new Literal(source, Period.ofMonths(3), DataType.DATE_PERIOD); + } + + // Return the smallest time unit. + switch (lastLevel) { + case 0: + return new Literal(source, Period.ofYears(1), DataType.DATE_PERIOD); + case 1: + return new Literal(source, Period.ofMonths(1), DataType.DATE_PERIOD); + case 2: + return new Literal(source, Period.ofDays(1), DataType.DATE_PERIOD); + case 3: + return new Literal(source, ChronoUnit.HOURS.getDuration(), DataType.TIME_DURATION); + case 4: + return new Literal(source, ChronoUnit.MINUTES.getDuration(), DataType.TIME_DURATION); + case 5: + return new Literal(source, ChronoUnit.SECONDS.getDuration(), DataType.TIME_DURATION); + case 6: + return new Literal(source, ChronoUnit.MILLIS.getDuration(), DataType.TIME_DURATION); + } + } catch (IllegalArgumentException ignored) {} + return null; + } } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/date/DateFormatToTruncIntervalTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/date/DateFormatToTruncIntervalTests.java new file mode 100644 index 0000000000000..70107e23d5bde --- /dev/null +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/date/DateFormatToTruncIntervalTests.java @@ -0,0 +1,183 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.esql.expression.function.scalar.date; + +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.esql.core.expression.Literal; +import org.elasticsearch.xpack.esql.core.tree.Source; + +import java.util.List; +import java.util.Objects; + +import static org.elasticsearch.xpack.esql.optimizer.rules.logical.ReplaceAggregateNestedExpressionWithEval.inferTruncIntervalFromFormat; + +public class DateFormatToTruncIntervalTests extends ESTestCase { + + public void testYear() { + List formats = List.of("y", "yy", "yyy", "yyyy", "yyyyyyyyyy", "u", "uu", "uuu", "uuuuuu"); + test(formats, "P1Y"); + } + + public void testQuarter() { + List formats = List.of("yyyy-Q", "yyyy-QQ", "yyyy-q", "yyyy-qq", "yyyy-qqq", "yyyy-qqqq"); + test(formats, "P3M"); + } + + public void testMonth() { + List formats = List.of( + "yyyy-MM", + "y-M", + "u-L", + "yyyy/MM", + "y M", + // Edge cases with different month patterns + "yyyy-MMM", + "yyyy-MMMM", // Text month names should still work + "yyyy-LL", + "yyyy-LLL", + "yyyy-LLLL" // Standalone month names + ); + test(formats, "P1M"); + } + + public void testDay() { + List formats = List.of( + "yyyy-MM-dd", + "y-M-d", + "yyyy/MM/dd", + "yyyy MM dd", + // Edge cases with different day patterns + "yyyy-MM-d", + "yyyy-M-dd", // Mixed single/double digits + // Different separators + "yyyy_MM_dd", + "yyyy:MM:dd", + // Day of year (D) should map to day level but requires year and month context + "yyyy-D", + "y-DDD" + ); + test(formats, "P1D"); + } + + public void testHour() { + List formats = List.of( + "yyyy-MM-dd HH", + "y-M-d H", + "yyyy-MM-dd kk", + "yyyy/MM/dd k", + // Edge cases with different hour patterns + "yyyy-MM-dd'T'HH", // ISO format with literal 'T' + // Single digit hours + "yyyy-MM-dd H", + "yyyy-MM-dd k" + ); + test(formats, "PT1H"); + } + + public void testMinute() { + List formats = List.of( + "yyyy-MM-dd HH:mm", + "y-M-d H:m", + "yyyy-MM-dd kk:mm", + // Edge cases with different minute patterns + "yyyy-MM-dd'T'HH:mm", // ISO format with literal 'T' + // Different separators + "yyyy-MM-dd H.mm", + "yyyy-MM-dd HH mm" + ); + test(formats, "PT1M"); + } + + public void testSecond() { + List formats = List.of( + "yyyy-MM-dd HH:mm:ss", + "y-M-d H:m:s", + "yyyy-MM-dd kk:mm:ss", + // Complex valid patterns that map to second + "yyyy-MM-dd'T'HH:mm:ss", // ISO format + "yyyy/MM/dd HH:mm:ss", + "yyyy.MM.dd.HH.mm.ss", + // Edge cases with different second patterns + "yyyy-MM-dd HH mm ss", // Space separators + "yyyy-MM-dd'T'H:m:s", // ISO with single digits + "yyyy-MM-dd HH.mm.ss" // Mixed separators + ); + test(formats, "PT1S"); + } + + public void testMillisecond() { + // Millisecond of day (A) should map to millisecond level + List formats = List.of("yyyy-MM-dd A"); + test(formats, "PT0.001S"); + } + + public void testUnsupportedPatterns() { + // Test patterns that should return null due to unsupported fields + List unsupportedFormats = List.of( + // ERA + "G yyyy", + "GGGG yyyy", + // Quarter + "yyyy-Q-h", + "yyyy-QQ-d", + // Week fields + "yyyy-w", + "yyyy-ww", + "yyyy-W", + // Day of week + "yyyy-MM-dd E", + "yyyy-MM-dd EEEE", + "yyyy-MM-dd c", + "yyyy-MM-dd e", + // AM/PM + "yyyy-MM-dd a", + // 12-hour formats + "yyyy-MM-dd h:mm", + "yyyy-MM-dd K:mm", + // Nanoseconds + "yyyy-MM-dd HH:mm:ss.S", + "yyyy-MM-dd HH:mm:ss.n", + "yyyy-MM-dd N", + // Timezone fields + "yyyy-MM-dd HH:mm:ss z", 
+ "yyyy-MM-dd HH:mm:ss Z", + "yyyy-MM-dd HH:mm:ss X", + "yyyy-MM-dd HH:mm:ss VV", + "yyyy-MM-dd HH:mm:ss O", + // Day period + "yyyy-MM-dd HH:mm B", + // Modified Julian Day + "g", + // Non-continuous hierarchy patterns + "yyyy-dd", // year + day (skips month) + "yyyy HH", // year + hour (skips month and day) + "MM-dd", // month + day (skips year) + "HH:mm", // hour + minute (skips year, month, day) + "yyyy mm", // year + minute (skips month, day, hour) + "DD", // day of year only (missing year context) + "mm MM DD", // invalid order/hierarchy + // Invalid patterns + "invalid", + "xyz", + "yyyy-MM-dd-invalid", + "" // empty string + ); + + for (String format : unsupportedFormats) { + assertNull("Format '" + format + "' should return null", inferTruncIntervalFromFormat(format, Source.EMPTY)); + } + } + + private static void test(List formats, String expected) { + for (String format : formats) { + Literal literal = inferTruncIntervalFromFormat(format, Source.EMPTY); + assertEquals("Format '" + format + "' should return " + expected, expected, Objects.requireNonNull(literal).toString()); + } + } + +} diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java index 87942f4f25f00..baa1c53df2966 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java @@ -66,6 +66,8 @@ import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToInteger; import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToLong; import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToString; +import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateFormat; +import org.elasticsearch.xpack.esql.expression.function.scalar.date.DateTrunc; import org.elasticsearch.xpack.esql.expression.function.scalar.internal.PackDimension; import org.elasticsearch.xpack.esql.expression.function.scalar.internal.UnpackDimension; import org.elasticsearch.xpack.esql.expression.function.scalar.math.Round; @@ -6617,6 +6619,257 @@ public void testInlineStatsWithShadowedOutput() { var localRelation = as(limit.child(), LocalRelation.class); } + /** + * Project[[_meta_field{f}#14, emp_no{f}#8, first_name{f}#9, gender{f}#10, hire_date{f}#15, job{f}#16, job.raw{f}#17, lan + * guages{f}#11, last_name{f}#12, long_noidx{f}#18, salary{f}#13, avg_salary{r}#4, year_hired{r}#7]] + * \_Eval[[DATEFORMAT(yyyy[KEYWORD],year_hired{r$}#19) AS year_hired#7]] + * \_Limit[1000[INTEGER],false] + * \_InlineJoin[LEFT,[year_hired{r$}#19],[year_hired{r$}#19]] + * |_Eval[[DATETRUNC(P1Y[DATE_PERIOD],hire_date{f}#15) AS year_hired#19]] + * | \_EsRelation[test][_meta_field{f}#14, emp_no{f}#8, first_name{f}#9, ge..] 
+ * \_Project[[avg_salary{r}#4, year_hired{r$}#19]] + * \_Eval[[$$SUM$avg_salary$0{r$}#20 / $$COUNT$avg_salary$1{r$}#21 AS avg_salary#4]] + * \_Aggregate[[year_hired{r$}#19],[SUM(salary{f}#13,true[BOOLEAN],compensated[KEYWORD]) AS $$SUM$avg_salary$0#20, COUNT(salar + * y{f}#13,true[BOOLEAN]) AS $$COUNT$avg_salary$1#21, year_hired{r$}#19]] + * \_StubRelation[[_meta_field{f}#14, emp_no{f}#8, first_name{f}#9, gender{f}#10, hire_date{f}#15, job{f}#16, job.raw{f}#17, + * languages{f}#11, last_name{f}#12, long_noidx{f}#18, salary{f}#13, year_hired{r$}#19]] + */ + public void testInlineStatsDateFormatOptimization() { + var query = """ + FROM test + | INLINE STATS avg_salary = AVG(salary) BY year_hired = DATE_FORMAT("yyyy", hire_date) + """; + if (releaseBuildForInlineStats(query)) { + return; + } + var optimized = optimizedPlan(query); + + var project = as(optimized, Project.class); + var eval = as(project.child(), Eval.class); + // Should have DATE_FORMAT conversion from optimized DATE_TRUNC + assertThat(eval.fields(), hasSize(1)); // year_hired conversion + var dateFormat = as(eval.fields().getFirst().child(), DateFormat.class); + + var limit = as(eval.child(), Limit.class); + var inlineJoin = as(limit.child(), InlineJoin.class); + + // Left side: original data with DATE_TRUNC + var leftEval = as(inlineJoin.left(), Eval.class); + assertThat(leftEval.fields(), hasSize(1)); // DATE_TRUNC operation + var dateTrunc = as(leftEval.fields().getFirst().child(), DateTrunc.class); + var leftSource = as(leftEval.child(), EsRelation.class); + + // Right side: aggregation + var rightProject = as(inlineJoin.right(), Project.class); + var rightEval = as(rightProject.child(), Eval.class); + assertThat(rightEval.fields(), hasSize(1)); // avg_salary calculation + var rightAgg = as(rightEval.child(), Aggregate.class); + var rightSource = as(rightAgg.child(), StubRelation.class); + } + + /** + * Project[[_meta_field{f}#19, emp_no{f}#13, first_name{f}#14, gender{f}#15, hire_date{f}#20, job{f}#21, job.raw{f}#22, l + * anguages{f}#16, last_name{f}#17, long_noidx{f}#23, salary{f}#18, count{r}#3, year_opt{r}#6, not_opt{r}#9, month_opt{r}#12]] + * \_Eval[[DATEFORMAT(yyyy[KEYWORD],year_opt{r$}#24) AS year_opt#6, DATEFORMAT(yyyy-MM[KEYWORD],month_opt{r$}#25) AS mon + * th_opt#12]] + * \_Limit[1000[INTEGER],false] + * \_InlineJoin[LEFT,[year_opt{r$}#24, not_opt{r}#9, month_opt{r$}#25],[year_opt{r$}#24, not_opt{r}#9, month_opt{r$}#25]] + * |_Eval[[DATETRUNC(P1Y[DATE_PERIOD],hire_date{f}#20) AS year_opt#24, DATEFORMAT(yyyy-m[KEYWORD],hire_date{f}#20) AS no + * t_opt#9, DATETRUNC(P1M[DATE_PERIOD],hire_date{f}#20) AS month_opt#25]] + * | \_EsRelation[test][_meta_field{f}#19, emp_no{f}#13, first_name{f}#14, ..] 
+ * \_Aggregate[[year_opt{r$}#24, not_opt{r}#9, month_opt{r$}#25],[COUNT(*[KEYWORD],true[BOOLEAN]) AS count#3, year_opt{r$}#24, + * not_opt{r}#9, month_opt{r$}#25]] + * \_StubRelation[[_meta_field{f}#19, emp_no{f}#13, first_name{f}#14, gender{f}#15, hire_date{f}#20, job{f}#21, job.raw{f}#22, l + * anguages{f}#16, last_name{f}#17, long_noidx{f}#23, salary{f}#18, year_opt{r$}#24, not_opt{r}#9, month_opt{r$}#25]] + */ + public void testInlineStatsMixedDateFormatOptimization() { + var query = """ + FROM test + | INLINE STATS count = COUNT(*) BY + year_opt = DATE_FORMAT("yyyy", hire_date), + not_opt = DATE_FORMAT("yyyy-m", hire_date), + month_opt = DATE_FORMAT("yyyy-MM", hire_date) + """; + if (releaseBuildForInlineStats(query)) { + return; + } + var optimized = optimizedPlan(query); + + var project = as(optimized, Project.class); + var eval = as(project.child(), Eval.class); + // Should have 2 DATE_FORMAT conversions (for optimized fields only) + assertThat(eval.fields(), hasSize(2)); // year_opt and month_opt conversions + + var limit = as(eval.child(), Limit.class); + var inlineJoin = as(limit.child(), InlineJoin.class); + + // Left side: should have mixed DATE_TRUNC and DATE_FORMAT operations + var leftEval = as(inlineJoin.left(), Eval.class); + assertThat(leftEval.fields(), hasSize(3)); // 2 DATE_TRUNC + 1 DATE_FORMAT + + // First field: DATE_TRUNC for year (optimized) + var dateTrunc1 = as(leftEval.fields().get(0).child(), DateTrunc.class); + + // Second field: DATE_FORMAT for not_opt (not optimizable) + var dateFormat = as(leftEval.fields().get(1).child(), DateFormat.class); + + // Third field: DATE_TRUNC for month (optimized) + var dateTrunc2 = as(leftEval.fields().get(2).child(), DateTrunc.class); + + var leftSource = as(leftEval.child(), EsRelation.class); + + // Right side: aggregation + var rightAgg = as(inlineJoin.right(), Aggregate.class); + assertThat(rightAgg.groupings(), hasSize(3)); // 3 grouping fields + var rightSource = as(rightAgg.child(), StubRelation.class); + } + + /** + * Limit[1000[INTEGER],false] + * \_InlineJoin[LEFT,[format{r}#7],[format{r}#7]] + * |_Eval[[DATEFORMAT(yyyy-dd[KEYWORD],hire_date{f}#15) AS format#7]] + * | \_EsRelation[test][_meta_field{f}#14, emp_no{f}#8, first_name{f}#9, ge..] 
+ * \_Project[[avg_salary{r}#4, format{r}#7]] + * \_Eval[[$$SUM$avg_salary$0{r$}#19 / $$COUNT$avg_salary$1{r$}#20 AS avg_salary#4]] + * \_Aggregate[[format{r}#7],[SUM(salary{f}#13,true[BOOLEAN],compensated[KEYWORD]) AS $$SUM$avg_salary$0#19, COUNT(salary{f}#1 + * 3,true[BOOLEAN]) AS $$COUNT$avg_salary$1#20, format{r}#7]] + * \_StubRelation[[_meta_field{f}#14, emp_no{f}#8, first_name{f}#9, gender{f}#10, hire_date{f}#15, job{f}#16, job.raw{f}#17, lan + * guages{f}#11, last_name{f}#12, long_noidx{f}#18, salary{f}#13, format{r}#7]] + */ + public void testInlineStatsNonOptimizableDateFormat() { + var query = """ + FROM test + | INLINE STATS avg_salary = AVG(salary) BY format = DATE_FORMAT("yyyy-dd", hire_date) + """; + if (releaseBuildForInlineStats(query)) { + return; + } + var optimized = optimizedPlan(query); + + var limit = as(optimized, Limit.class); + var inlineJoin = as(limit.child(), InlineJoin.class); + + // Left side: should have DATE_FORMAT operation (non-optimizable format) + var leftEval = as(inlineJoin.left(), Eval.class); + assertThat(leftEval.fields(), hasSize(1)); // format conversion + var dateFormat = as(leftEval.fields().getFirst().child(), DateFormat.class); + var leftSource = as(leftEval.child(), EsRelation.class); + + // Right side: aggregation + var rightProject = as(inlineJoin.right(), Project.class); + var rightEval = as(rightProject.child(), Eval.class); + assertThat(rightEval.fields(), hasSize(1)); // avg_salary calculation + var rightAgg = as(rightEval.child(), Aggregate.class); + var rightSource = as(rightAgg.child(), StubRelation.class); + } + + /** + * Project[[_meta_field{f}#1197, emp_no{f}#1191, first_name{f}#1192, gender{f}#1193, hire_date{f}#1198, job{f}#1199, job.raw{f}#1200, + * languages{f}#1194, last_name{f}#1195, long_noidx{f}#1201, salary{f}#1196, count{r}#1178, avg_salary{r}#1181, year_hired{r}#1184, + * month_hired{r}#1187, day_hired{r}#1190]] + * \_Eval[[DATEFORMAT(yyyy[KEYWORD],year_hired{r$}#1202) AS year_hired#1184, DATEFORMAT(yyyy-MM[KEYWORD],month_hired{r$} + * #1203) AS month_hired#1187, DATEFORMAT(yyyy-MM-dd[KEYWORD],day_hired{r$}#1204) AS day_hired#1190]] + * \_Limit[1000[INTEGER],false] + * \_InlineJoin[LEFT,[year_hired{r$}#1202, month_hired{r$}#1203, day_hired{r$}#1204],[year_hired{r$}#1202, month_hired{r$}#1203, + * day_hired{r$}#1204]] + * |_Eval[[DATETRUNC(P1Y[DATE_PERIOD],hire_date{f}#1198) AS year_hired#1202, DATETRUNC(P1M[DATE_PERIOD],hire_date{f}#119 + * 8) AS month_hired#1203, DATETRUNC(P1D[DATE_PERIOD],hire_date{f}#1198) AS day_hired#1204]] + * | \_EsRelation[test][_meta_field{f}#1197, emp_no{f}#1191, first_name{f}#..] 
+ * \_Project[[count{r}#1178, avg_salary{r}#1181, year_hired{r$}#1202, month_hired{r$}#1203, day_hired{r$}#1204]] + * \_Eval[[$$SUM$avg_salary$0{r$}#1205 / $$COUNT$avg_salary$1{r$}#1206 AS avg_salary#1181]] + * \_Aggregate[[year_hired{r$}#1202, month_hired{r$}#1203, day_hired{r$}#1204],[COUNT(*[KEYWORD],true[BOOLEAN]) AS count#1178, + * SUM(salary{f}#1196,true[BOOLEAN],compensated[KEYWORD]) AS $$SUM$avg_salary$0#1205, COUNT(salary{f}#1196,true[BOOLEAN]) AS $$COUNT$ + * avg_salary$1#1206, year_hired{r$}#1202, month_hired{r$}#1203, day_hired{r$}#1204]] + * \_StubRelation[[_meta_field{f}#1197, emp_no{f}#1191, first_name{f}#1192, gender{f}#1193, hire_date{f}#1198, job{f}#1199, + * job.raw{f}#1200, languages{f}#1194, last_name{f}#1195, long_noidx{f}#1201, salary{f}#1196, year_hired{r$}#1202, month_hired{r$}#1203, + * day_hired{r$}#1204]] + */ + public void testInlineStatsMultipleDateFormatOptimizations() { + var query = """ + FROM test + | INLINE STATS + count = COUNT(*), + avg_salary = AVG(salary) + BY + year_hired = DATE_FORMAT("yyyy", hire_date), + month_hired = DATE_FORMAT("yyyy-MM", hire_date), + day_hired = DATE_FORMAT("yyyy-MM-dd", hire_date) + """; + if (releaseBuildForInlineStats(query)) { + return; + } + var optimized = optimizedPlan(query); + + var project = as(optimized, Project.class); + var eval = as(project.child(), Eval.class); + // Should have 3 DATE_FORMAT conversions + assertThat(eval.fields(), hasSize(3)); // 3 DATE_FORMAT conversions + + var limit = as(eval.child(), Limit.class); + var inlineJoin = as(limit.child(), InlineJoin.class); + + // Left side: should have 3 DATE_TRUNC operations + var leftEval = as(inlineJoin.left(), Eval.class); + assertThat(leftEval.fields(), hasSize(3)); // 3 DATE_TRUNC operations + + // All should be DATE_TRUNC operations + for (int i = 0; i < 3; i++) { + var dateTrunc = as(leftEval.fields().get(i).child(), DateTrunc.class); + } + + var leftSource = as(leftEval.child(), EsRelation.class); + + // Right side: aggregation + var rightProject = as(inlineJoin.right(), Project.class); + var rightEval = as(rightProject.child(), Eval.class); + assertThat(rightEval.fields(), hasSize(1)); // avg_salary calculation + var rightAgg = as(rightEval.child(), Aggregate.class); + assertThat(rightAgg.groupings(), hasSize(3)); // 3 grouping fields + var rightSource = as(rightAgg.child(), StubRelation.class); + } + + /** + * Project[[_meta_field{f}#13, emp_no{f}#7, first_name{f}#8, gender{f}#9, hire_date{f}#14, job{f}#15, job.raw{f}#16, lang + * uages{f}#10, last_name{f}#11, long_noidx{f}#17, salary{f}#12, count{r}#3, formatted_date{r}#6]] + * \_Eval[[DATEFORMAT('Year:'yyyy'-Month:'MM[KEYWORD],formatted_date{r$}#18) AS formatted_date#6]] + * \_Limit[1000[INTEGER],false] + * \_InlineJoin[LEFT,[formatted_date{r$}#18],[formatted_date{r$}#18]] + * |_Eval[[DATETRUNC(P1M[DATE_PERIOD],hire_date{f}#14) AS formatted_date#18]] + * | \_EsRelation[test][_meta_field{f}#13, emp_no{f}#7, first_name{f}#8, ge..] 
+ * \_Aggregate[[formatted_date{r$}#18],[COUNT(*[KEYWORD],true[BOOLEAN]) AS count#3, formatted_date{r$}#18]] + * \_StubRelation[[_meta_field{f}#13, emp_no{f}#7, first_name{f}#8, gender{f}#9, hire_date{f}#14, job{f}#15, job.raw{f}#16, lang + * uages{f}#10, last_name{f}#11, long_noidx{f}#17, salary{f}#12, formatted_date{r$}#18]] + */ + public void testInlineStatsComplexDateFormatOptimization() { + var query = """ + FROM test + | INLINE STATS count = COUNT(*) BY formatted_date = DATE_FORMAT("'Year:'yyyy'-Month:'MM", hire_date) + """; + if (releaseBuildForInlineStats(query)) { + return; + } + var optimized = optimizedPlan(query); + + var project = as(optimized, Project.class); + var eval = as(project.child(), Eval.class); + // Should have DATE_FORMAT conversion + assertThat(eval.fields(), hasSize(1)); // formatted_date conversion + var dateFormat = as(eval.fields().get(0).child(), DateFormat.class); + + var limit = as(eval.child(), Limit.class); + var inlineJoin = as(limit.child(), InlineJoin.class); + + // Left side: should have DATE_TRUNC optimization + var leftEval = as(inlineJoin.left(), Eval.class); + assertThat(leftEval.fields(), hasSize(1)); // DATE_TRUNC operation + var dateTrunc = as(leftEval.fields().getFirst().child(), DateTrunc.class); + var leftSource = as(leftEval.child(), EsRelation.class); + + // Right side: aggregation + var rightAgg = as(inlineJoin.right(), Aggregate.class); + var rightSource = as(rightAgg.child(), StubRelation.class); + } + /** * Expects * @@ -8869,6 +9122,656 @@ public void testSampleNoPushDownChangePoint() { var source = as(topN.child(), EsRelation.class); } + /** + * Project[[avg{r}#7, date{r}#4]] + * \_Eval[[$$SUM$avg$0{r$}#21 / $$COUNT$avg$1{r$}#22 AS avg#7, DATEFORMAT(yyyy[KEYWORD],date{r$}#20) AS date#4]] + * \_Limit[1000[INTEGER],false] + * \_Aggregate[[date{r$}#20],[SUM(salary{f}#14,true[BOOLEAN],compensated[KEYWORD]) AS $$SUM$avg$0#21, COUNT(salary{f}#14,true[ + * BOOLEAN]) AS $$COUNT$avg$1#22, date{r$}#20]] + * \_Eval[[DATETRUNC(P1Y[DATE_PERIOD],hire_date{f}#16) AS date#20]] + * \_EsRelation[test][_meta_field{f}#15, emp_no{f}#9, first_name{f}#10, g..] 
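+ *
+ * Illustrative sketch of the rewrite above (not an exact plan): a grouping like
+ *   STATS avg = AVG(salary) BY date = DATE_FORMAT("yyyy", hire_date)
+ * is evaluated roughly as
+ *   EVAL t = DATE_TRUNC(1 year, hire_date) | STATS avg = AVG(salary) BY t | EVAL date = DATE_FORMAT("yyyy", t)
+ * so the aggregation groups on the truncated datetime and the keyword key is rebuilt afterwards.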
+ */ + public void testReplaceGroupingByDateFormatWithDateTrunc() { + + List formats = List.of( + // Original formats + "yyyy", + "YYYY", + "MM/yyyy", + "yyyy-dd-MM", + "yyyy-MM-dd HH:mm:ss", + // Additional year formats + "y", + "yy", + "yyy", + "u", + "uu", + "uuu", + // Quarter patterns + "yyyy-Q", + "yyyy-QQ", + "yyyy-q", + "yyyy-qq", + "yyyy-qqqq-MM", + // Week-based year + "YYYY-MM-dd", + // Additional month formats + "yyyy-MM", + "yyyy/MM", + "yyyy MM", + "yyyy-M", + "yyyy-MMM", + "yyyy-MMMM", + "yyyy-L", + "yyyy-LLL", + "yyyy-LLLL", + // Additional day formats + "yyyy-MM-dd", + "yyyy/MM/dd", + "yyyy.MM.dd", + "yyyy_MM_dd", + "yyyy:MM:dd", + "yyyy-M-d", + "yyyy-MM-d", + "yyyy-M-dd", + // Day of year formats + "yyyy-D", + "yyyy-DDD", + // Hour formats + "yyyy-MM-dd HH", + "yyyy-MM-dd H", + "yyyy-MM-dd kk", + "yyyy-MM-dd k", + "yyyy-MM-dd'T'HH", + "yyyy/MM/dd H", + // Minute formats + "yyyy-MM-dd HH:mm", + "yyyy-MM-dd H:m", + "yyyy-MM-dd kk:mm", + "yyyy-MM-dd'T'HH:mm", + "yyyy-MM-dd H.mm", + "yyyy-MM-dd HH mm", + // Second formats + "yyyy-MM-dd HH:mm:ss", + "yyyy-MM-dd H:m:s", + "yyyy-MM-dd kk:mm:ss", + "yyyy-MM-dd'T'HH:mm:ss", + "yyyy/MM/dd HH:mm:ss", + "yyyy.MM.dd.HH.mm.ss", + "yyyy-MM-dd HH mm ss", + "yyyy-MM-dd'T'H:m:s", + "yyyy-MM-dd HH.mm.ss", + // Millisecond formats + "yyyy-MM-dd A", + // Complex formats with literals + "yyyy'-'MM'-'dd", + "yyyy'年'MM'月'dd'日'", + "yyyy'/'MM'/'dd", + "'Year:'yyyy'-Month:'MM" + ); + + var queries = List.of(""" + FROM test + | STATS avg = AVG(integer) BY date = DATE_FORMAT("%s", date) + """, """ + FROM test + | STATS avg = AVG(integer) BY date = DATE_FORMAT("%s", date_nanos) + """); + + for (var format : formats) { + for (var query : queries) { + String format1 = String.format(Locale.ROOT, query, format); + var optimized = planTypes(format1); + + var project = as(optimized, Project.class); + var eval = as(project.child(), Eval.class); + assertThat(eval.fields(), hasSize(2)); + var dateformat = as(eval.fields().get(1).child(), DateFormat.class); + + var limit = as(eval.child(), Limit.class); + var agg = as(limit.child(), Aggregate.class); + var ref = as(agg.groupings().getFirst(), ReferenceAttribute.class); + + var eval2 = as(agg.child(), Eval.class); + assertThat(eval2.fields(), hasSize(1)); + var dateTrunc = as(eval2.fields().getFirst().child(), DateTrunc.class); + assertThat(eval2.fields().getFirst().toAttribute(), is(ref)); + + var source = as(eval2.child(), EsRelation.class); + } + } + } + + /** + * Project[[avg{r}#13, date{r}#4, date2{r}#7, date3{r}#10]] + * \_Eval[[$$SUM$avg$0{r$}#30 / $$COUNT$avg$1{r$}#31 AS avg#13, DATEFORMAT(y-M-dd[KEYWORD],date{r$}#28) AS date#4, DATEF + * ORMAT(u-MMM[KEYWORD],date3{r$}#29) AS date3#10]] + * \_Limit[1000[INTEGER],false] + * \_Aggregate[[date{r$}#28, date2{r}#7, date3{r$}#29],[SUM(salary{f}#22,true[BOOLEAN],compensated[KEYWORD]) AS $$SUM$avg$0#30 + * , COUNT(salary{f}#22,true[BOOLEAN]) AS $$COUNT$avg$1#31, date{r$}#28, date2{r}#7, date3{r$}#29]] + * \_Eval[[DATETRUNC(P1D[DATE_PERIOD],hire_date{f}#24) AS date#28, DATEFORMAT(yyyy-MM-mm[KEYWORD],hire_date{f}#24) AS da + * te2#7, DATETRUNC(P1M[DATE_PERIOD],hire_date{f}#24) AS date3#29]] + * \_EsRelation[test][_meta_field{f}#23, emp_no{f}#17, first_name{f}#18, ..] 
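+ *
+ * Only keys whose patterns follow the continuous year-to-smaller-unit hierarchy are rewritten here: "y-M-dd"
+ * becomes DATE_TRUNC(P1D, ...) and "u-MMM" becomes DATE_TRUNC(P1M, ...), while "yyyy-MM-mm" (minutes without
+ * hours) keeps its original DATE_FORMAT grouping.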
+ */ + public void testMixedDateFormatOptimizationInGroupBy() { + var query = """ + FROM test + | STATS avg = AVG(salary) + BY date = DATE_FORMAT("y-M-dd", hire_date), + date2 = DATE_FORMAT("yyyy-MM-mm", hire_date), + date3 = DATE_FORMAT("u-MMM", hire_date) + """; + var optimized = optimizedPlan(query); + + // Top level: Project with all output fields + var project = as(optimized, Project.class); + assertThat(Expressions.names(project.projections()), contains("avg", "date", "date2", "date3")); + + // Second level: Eval that computes the final avg and converts optimized DATE_TRUNC back to DATE_FORMAT + var topEval = as(project.child(), Eval.class); + assertThat(topEval.fields(), hasSize(3)); // avg calculation + 2 DATE_FORMAT conversions + + // Check that avg is computed from SUM/COUNT + var avgField = topEval.fields().get(0); + assertThat(avgField.name(), equalTo("avg")); + + // Check DATE_FORMAT conversions for optimized fields + var dateField = topEval.fields().get(1); + assertThat(dateField.name(), equalTo("date")); + var dateFormat1 = as(dateField.child(), DateFormat.class); + + var date3Field = topEval.fields().get(2); + assertThat(date3Field.name(), equalTo("date3")); + var dateFormat3 = as(date3Field.child(), DateFormat.class); + + var limit = as(topEval.child(), Limit.class); + var agg = as(limit.child(), Aggregate.class); + + // Aggregate should group by 3 fields (2 optimized DATE_TRUNC references + 1 original DATE_FORMAT) + assertThat(agg.groupings(), hasSize(3)); + assertThat(agg.aggregates(), hasSize(5)); // SUM, COUNT, and 3 grouping fields + + // Bottom level: Eval that creates the optimized DATE_TRUNC operations and non-optimizable DATE_FORMAT + var bottomEval = as(agg.child(), Eval.class); + assertThat(bottomEval.fields(), hasSize(3)); + + // First field: DATE_TRUNC for "y-M-dd" (day level) + var dateTruncField1 = bottomEval.fields().get(0); + var dateTrunc1 = as(dateTruncField1.child(), DateTrunc.class); + + // Second field: DATE_FORMAT for "yyyy-MM-mm" (cannot be optimized due to 'mm' - minute without hour) + var dateFormatField2 = bottomEval.fields().get(1); + var dateFormat2 = as(dateFormatField2.child(), DateFormat.class); + + // Third field: DATE_TRUNC for "u-MMM" (month level) + var dateTruncField3 = bottomEval.fields().get(2); + var dateTrunc3 = as(dateTruncField3.child(), DateTrunc.class); + + var source = as(bottomEval.child(), EsRelation.class); + } + + /** + * Project[[avg{r}#7, date{r}#4]] + * \_Eval[[$$SUM$avg$0{r$}#20 / $$COUNT$avg$1{r$}#21 AS avg#7]] + * \_Limit[1000[INTEGER],false] + * \_Aggregate[[date{r}#4],[SUM(salary{f}#14,true[BOOLEAN],compensated[KEYWORD]) AS $$SUM$avg$0#20, COUNT(salary{f}#14,true[BO + * OLEAN]) AS $$COUNT$avg$1#21, date{r}#4]] + * \_Eval[[DATEFORMAT(G yyyy[KEYWORD],hire_date{f}#16) AS date#4]] + * \_EsRelation[test][_meta_field{f}#15, emp_no{f}#9, first_name{f}#10, g..] 
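+ *
+ * A pattern such as "G yyyy" keeps the DATE_FORMAT grouping because the era field has no DATE_TRUNC
+ * equivalent; the list below exercises the other unsupported families (weeks, day of week, am/pm, 12-hour
+ * clocks, fractional seconds, time zones, day periods and non-continuous field hierarchies).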
+ */ + public void testDateFormatNotConvertedToDateTrunc() { + List unsupportedFormats = List.of( + // ERA patterns + "G yyyy", + "GGGG yyyy", + // Quarter patterns + "yyyy-qqq-d", + // Week patterns + "yyyy-w", + "yyyy-ww", + "yyyy-W", + // Day of week patterns + "yyyy-MM-dd E", + "yyyy-MM-dd EEEE", + "yyyy-MM-dd c", + "yyyy-MM-dd e", + // AM/PM patterns + "yyyy-MM-dd a", + // 12-hour patterns + "yyyy-MM-dd h:mm", + "yyyy-MM-dd K:mm", + // Nanosecond patterns + "yyyy-MM-dd HH:mm:ss.S", + "yyyy-MM-dd HH:mm:ss.n", + "yyyy-MM-dd N", + // Timezone patterns + "yyyy-MM-dd HH:mm:ss z", + "yyyy-MM-dd HH:mm:ss Z", + "yyyy-MM-dd HH:mm:ss X", + "yyyy-MM-dd HH:mm:ss VV", + "yyyy-MM-dd HH:mm:ss O", + + // Day period + "yyyy-MM-dd HH:mm B", + // Modified Julian Day + "g", + // Non-continuous hierarchy patterns + "yyyy-dd", // year + day (skips month) + "yyyy HH", // year + hour (skips month and day) + "MM-dd", // month + day (skips year) + "HH:mm", // hour + minute (skips year, month, day) + "yyyy mm", // year + minute (skips month, day, hour) + "DD" // day of year only (missing year context) + ); + + var queries = List.of(""" + FROM test + | STATS avg = AVG(integer) BY date = DATE_FORMAT("%s", date) + """, """ + FROM test + | STATS avg = AVG(integer) BY date = DATE_FORMAT("%s", date_nanos) + """); + + for (var format : unsupportedFormats) { + for (var query : queries) { + String formatQuery = String.format(Locale.ROOT, query, format); + var optimized = planTypes(formatQuery); + + var project = as(optimized, Project.class); + var eval = as(project.child(), Eval.class); + assertThat(eval.fields(), hasSize(1)); + + var limit = as(eval.child(), Limit.class); + var agg = as(limit.child(), Aggregate.class); + var eval2 = as(agg.child(), Eval.class); + + var source = as(eval2.child(), EsRelation.class); + assertThat(source, instanceOf(EsRelation.class)); + } + } + } + + /** + * Project[[avg{r}#10, date{r}#7]] + * \_Eval[[$$SUM$avg$0{r$}#24 / $$COUNT$avg$1{r$}#25 AS avg#10, DATEFORMAT(yyyy-MM[KEYWORD],date{r$}#23) AS date#7]] + * \_Limit[1000[INTEGER],false] + * \_Aggregate[[date{r$}#23],[SUM(salary{f}#17,true[BOOLEAN],compensated[KEYWORD]) AS $$SUM$avg$0#24, COUNT(salary{f}#17,true[ + * BOOLEAN]) AS $$COUNT$avg$1#25, date{r$}#23]] + * \_Eval[[DATETRUNC(P1M[DATE_PERIOD],hire_date{f}#19) AS date#23]] + * \_EsRelation[test][_meta_field{f}#18, emp_no{f}#12, first_name{f}#13, ..] 
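+ *
+ * CONCAT("yyyy", "-MM") is constant folded to the literal "yyyy-MM", so the grouping still qualifies for the
+ * month-level DATE_TRUNC(P1M, ...) rewrite shown above.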
+ */ + public void testStatsDateFormatOptimizationWithConcat() { + var query = """ + FROM test + | EVAL format = concat("yyyy", "-MM") + | STATS avg = AVG(salary) BY date = DATE_FORMAT(format, hire_date) + """; + var optimized = optimizedPlan(query); + + var project = as(optimized, Project.class); + + var eval = as(project.child(), Eval.class); + assertThat(eval.fields(), hasSize(2)); + + // Verify avg calculation + var avgAlias = as(eval.fields().getFirst(), Alias.class); + assertThat(avgAlias.name(), equalTo("avg")); + var avgDiv = as(avgAlias.child(), Div.class); + + // Verify date formatting - should be optimized to use literal "yyyy-MM" + var dateAlias = as(eval.fields().get(1), Alias.class); + assertThat(dateAlias.name(), equalTo("date")); + var dateFormat = as(dateAlias.child(), DateFormat.class); + var formatArg = dateFormat.arguments().getFirst(); + assertThat(formatArg, instanceOf(Literal.class)); + var formatLiteral = as(formatArg, Literal.class); + assertThat(formatLiteral.value(), equalTo(new BytesRef("yyyy-MM"))); + + var limit = as(eval.child(), Limit.class); + var agg = as(limit.child(), Aggregate.class); + + // Verify that the aggregation uses DATE_TRUNC optimization + var eval2 = as(agg.child(), Eval.class); + assertThat(eval2.fields(), hasSize(1)); + var dateTruncAlias = as(eval2.fields().getFirst(), Alias.class); + assertThat(dateTruncAlias.name(), equalTo("date")); + var dateTrunc = as(dateTruncAlias.child(), DateTrunc.class); + + var source = as(eval2.child(), EsRelation.class); + assertThat(source, instanceOf(EsRelation.class)); + } + + /** + * Limit[1000[INTEGER],false] + * \_InlineJoin[LEFT,[date{r}#10],[date{r}#10]] + * |_Eval[[yyyy-MM[KEYWORD] AS format#3, DATEFORMAT(yyyy-MM[KEYWORD],hire_date{f}#18) AS date#10]] + * | \_EsRelation[test][_meta_field{f}#17, emp_no{f}#11, first_name{f}#12, ..] 
+ * \_Project[[avg{r}#6, date{r}#10]] + * \_Eval[[$$SUM$avg$0{r$}#22 / $$COUNT$avg$1{r$}#23 AS avg#6]] + * \_Aggregate[[date{r}#10],[SUM(salary{f}#16,true[BOOLEAN],compensated[KEYWORD]) AS $$SUM$avg$0#22, COUNT(salary{f}#16,true[B + * OOLEAN]) AS $$COUNT$avg$1#23, date{r}#10]] + * \_StubRelation[[_meta_field{f}#17, emp_no{f}#11, first_name{f}#12, gender{f}#13, hire_date{f}#18, job{f}#19, job.raw{f}#20, l + * anguages{f}#14, last_name{f}#15, long_noidx{f}#21, salary{f}#16, format{r}#3, date{r}#10]] + */ + public void testInlineStatsDateFormatOptimizationWithConcat() { + var query = """ + FROM test + | EVAL format = concat("yyyy", "-MM") + | INLINE STATS avg = AVG(salary) BY date = DATE_FORMAT(format, hire_date) + """; + var optimized = optimizedPlan(query); + + var limit = as(optimized, Limit.class); + var inlineJoin = as(limit.child(), InlineJoin.class); + + // Check left side of the join (original data with eval) + var leftEval = as(inlineJoin.left(), Eval.class); + assertThat(leftEval.fields(), hasSize(2)); + + // Verify that the format field is optimized to literal "yyyy-MM" + var formatAlias = as(leftEval.fields().getFirst(), Alias.class); + assertThat(formatAlias.name(), equalTo("format")); + var formatLiteral = as(formatAlias.child(), Literal.class); + assertThat(formatLiteral.value(), equalTo(new BytesRef("yyyy-MM"))); + + // Verify that DATE_FORMAT uses the optimized literal directly + var dateAlias = as(leftEval.fields().get(1), Alias.class); + assertThat(dateAlias.name(), equalTo("date")); + var dateFormat = as(dateAlias.child(), DateFormat.class); + var dateFormatArg = dateFormat.arguments().getFirst(); + assertThat(dateFormatArg, instanceOf(Literal.class)); + var dateFormatLiteral = as(dateFormatArg, Literal.class); + assertThat(dateFormatLiteral.value(), equalTo(new BytesRef("yyyy-MM"))); + + var leftSource = as(leftEval.child(), EsRelation.class); + assertThat(leftSource, instanceOf(EsRelation.class)); + + // Check right side of the join (aggregated stats) + var rightProject = as(inlineJoin.right(), Project.class); + + var rightEval = as(rightProject.child(), Eval.class); + assertThat(rightEval.fields(), hasSize(1)); + var avgAlias = as(rightEval.fields().getFirst(), Alias.class); + assertThat(avgAlias.name(), equalTo("avg")); + + var agg = as(rightEval.child(), Aggregate.class); + + var stubRelation = as(agg.child(), StubRelation.class); + assertThat(stubRelation, instanceOf(StubRelation.class)); + } + + /** + * Project[[sum(x){r}#11, date_format("yyyy-MM-dd", y){r}#9]] + * \_Eval[[DATEFORMAT(yyyy-MM-dd[KEYWORD],date_format("yyyy-MM-dd", y){r$}#34) AS date_format("yyyy-MM-dd", y)#9]] + * \_Limit[1000[INTEGER],false] + * \_Aggregate[[date_format("yyyy-MM-dd", y){r$}#34],[SUM(x{r}#4,true[BOOLEAN],compensated[KEYWORD]) AS sum(x)#11, date_format + * ("yyyy-MM-dd", y){r$}#34]] + * \_Eval[[TOLONG(integer{f}#23) + 10[INTEGER] AS x#4, date_nanos{f}#18 + P1D[DATE_PERIOD] + P1Y[DATE_PERIOD] - PT1H[TIM + * E_DURATION] AS y#7, DATETRUNC(P1D[DATE_PERIOD],y{r}#7) AS date_format("yyyy-MM-dd", y)#34]] + * \_EsRelation[types][!alias_integer, boolean{f}#14, byte{f}#15, constant..] 
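+ *
+ * The rewrite also applies when the formatted value is a computed reference rather than a field: y, a
+ * date_nanos expression, is truncated with DATE_TRUNC(P1D, y) and the "yyyy-MM-dd" key is rebuilt from the
+ * truncated value after the aggregation.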
+ */ + public void testStatsDateFormatWithReferenceAttribute() { + var query = """ + from test + | eval x = integer::long + 10, y = date_nanos + 1 day + 1 year - 1 hour + | stats sum(x) by date_format("yyyy-MM-dd", y) + """; + var optimized = planTypes(query); + + var project = as(optimized, Project.class); + assertThat(Expressions.names(project.projections()), contains("sum(x)", "date_format(\"yyyy-MM-dd\", y)")); + + var eval = as(project.child(), Eval.class); + assertThat(eval.fields(), hasSize(1)); // DATE_FORMAT conversion + var dateFormat = as(eval.fields().get(0).child(), DateFormat.class); + + var limit = as(eval.child(), Limit.class); + var agg = as(limit.child(), Aggregate.class); + assertThat(agg.groupings(), hasSize(1)); // grouped by date_format result + assertThat(agg.aggregates(), hasSize(2)); // sum(x) + grouping field + + var aggEval = as(agg.child(), Eval.class); + assertThat(aggEval.fields(), hasSize(3)); // x, y, and DATE_TRUNC optimization + var dateTrunc = as(aggEval.fields().get(2).child(), DateTrunc.class); + + var source = as(aggEval.child(), EsRelation.class); + } + + /** + * Project[[!alias_integer, boolean{f}#13, byte{f}#14, constant_keyword-foo{f}#15, date{f}#16, date_nanos{f}#17, dense_ve + * ctor{f}#32, double{f}#18, float{f}#19, half_float{f}#20, integer{f}#22, ip{f}#23, keyword{f}#24, long{f}#25, scaled_float{f}#21, + * semantic_text{f}#31, short{f}#27, text{f}#28, unsigned_long{f}#26, version{f}#29, wildcard{f}#30, x{r}#4, y{r}#7, sum(x){r}#9, + * date_format("yyyy-MM-dd", y){r}#11]] + * \_Eval[[DATEFORMAT(yyyy-MM-dd[KEYWORD],date_format("yyyy-MM-dd", y){r$}#33) AS date_format("yyyy-MM-dd", y)#11]] + * \_Limit[1000[INTEGER],false] + * \_InlineJoin[LEFT,[date_format("yyyy-MM-dd", y){r$}#33],[date_format("yyyy-MM-dd", y){r$}#33]] + * |_Eval[[TOLONG(integer{f}#22) + 10[INTEGER] AS x#4, date_nanos{f}#17 + P1D[DATE_PERIOD] + P1Y[DATE_PERIOD] - PT1H[TIM + * E_DURATION] AS y#7, DATETRUNC(P1D[DATE_PERIOD],y{r}#7) AS date_format("yyyy-MM-dd", y)#33]] + * | \_EsRelation[types][!alias_integer, boolean{f}#13, byte{f}#14, constant..] 
+ * \_Aggregate[[date_format("yyyy-MM-dd", y){r$}#33],[SUM(x{r}#4,true[BOOLEAN],compensated[KEYWORD]) AS sum(x)#9, date_format( + * "yyyy-MM-dd", y){r$}#33]] + * \_StubRelation[[!alias_integer, boolean{f}#13, byte{f}#14, constant_keyword-foo{f}#15, date{f}#16, date_nanos{f}#17, dense_ve + * ctor{f}#32, double{f}#18, float{f}#19, half_float{f}#20, integer{f}#22, ip{f}#23, keyword{f}#24, long{f}#25, scaled_float{f}#21, + * semantic_text{f}#31, short{f}#27, text{f}#28, unsigned_long{f}#26, version{f}#29, wildcard{f}#30, x{r}#4, y{r}#7, + * date_format("yyyy-MM-dd", y){r$}#33]] + */ + public void testInlineStatsDateFormatWithReferenceAttribute() { + var query = """ + from test + | eval x = integer::long + 10, y = date_nanos + 1 day + 1 year - 1 hour + | inline stats sum(x) by date_format("yyyy-MM-dd", y) + """; + if (releaseBuildForInlineStats(query)) { + return; + } + var optimized = planTypes(query); + + var project = as(optimized, Project.class); + var eval = as(project.child(), Eval.class); + assertThat(eval.fields(), hasSize(1)); // DATE_FORMAT conversion + var dateFormat = as(eval.fields().get(0).child(), DateFormat.class); + + var limit = as(eval.child(), Limit.class); + var inlineJoin = as(limit.child(), InlineJoin.class); + + // Left side: should have DATE_TRUNC optimization + var leftEval = as(inlineJoin.left(), Eval.class); + assertThat(leftEval.fields(), hasSize(3)); // x, y, and DATE_TRUNC optimization + var dateTrunc = as(leftEval.fields().get(2).child(), DateTrunc.class); + var leftSource = as(leftEval.child(), EsRelation.class); + + // Right side: aggregation + var rightAgg = as(inlineJoin.right(), Aggregate.class); + assertThat(rightAgg.groupings(), hasSize(1)); // grouped by date_format result + assertThat(rightAgg.aggregates(), hasSize(2)); // sum(x) + grouping field + var rightSource = as(rightAgg.child(), StubRelation.class); + } + + /** + * Project[[sum(x){r}#14, date_format(fmt, y){r}#12]] + * \_Eval[[DATEFORMAT(yyyy-MM-dd[KEYWORD],date_format(fmt, y){r$}#37) AS date_format(fmt, y)#12]] + * \_Limit[1000[INTEGER],false] + * \_Aggregate[[date_format(fmt, y){r$}#37],[SUM(x{r}#4,true[BOOLEAN],compensated[KEYWORD]) AS sum(x)#14, date_format(fmt, y){ + * r$}#37]] + * \_Eval[[TOLONG(integer{f}#26) + 10[INTEGER] AS x#4, date_nanos{f}#21 + P1D[DATE_PERIOD] + P1Y[DATE_PERIOD] - PT1H[TIM + * E_DURATION] AS y#7, DATETRUNC(P1D[DATE_PERIOD],y{r}#7) AS date_format(fmt, y)#37]] + * \_EsRelation[types][!alias_integer, boolean{f}#17, byte{f}#18, constant..] 
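+ *
+ * Even when the format string arrives through a reference (fmt = "yyyy-MM-dd"), the folded literal pattern is
+ * still rewritten to DATE_TRUNC(P1D, y) for the STATS grouping shown above.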
+ */ + public void testStatsDateFormatWithBothReferenceAttributes() { + var query = """ + from test + | eval x = integer::long + 10, y = date_nanos + 1 day + 1 year - 1 hour, fmt = "yyyy-MM-dd" + | stats sum(x) by date_format(fmt, y) + """; + var optimized = planTypes(query); + + var project = as(optimized, Project.class); + assertThat(Expressions.names(project.projections()), contains("sum(x)", "date_format(fmt, y)")); + + var eval = as(project.child(), Eval.class); + assertThat(eval.fields(), hasSize(1)); // DATE_FORMAT conversion + var dateFormat = as(eval.fields().get(0).child(), DateFormat.class); + + var limit = as(eval.child(), Limit.class); + var agg = as(limit.child(), Aggregate.class); + assertThat(agg.groupings(), hasSize(1)); // grouped by date_format result + assertThat(agg.aggregates(), hasSize(2)); // sum(x) + grouping field + + var aggEval = as(agg.child(), Eval.class); + assertThat(aggEval.fields(), hasSize(3)); // x, y, and DATE_TRUNC optimization + var dateTrunc = as(aggEval.fields().get(2).child(), DateTrunc.class); + + var source = as(aggEval.child(), EsRelation.class); + } + + /** + * Limit[1000[INTEGER],false] + * \_InlineJoin[LEFT,[date_format(fmt, y){r}#14],[date_format(fmt, y){r}#14]] + * |_Eval[[TOLONG(integer{f}#25) + 10[INTEGER] AS x#4, date_nanos{f}#20 + P1D[DATE_PERIOD] + P1Y[DATE_PERIOD] - PT1H[TIM + * E_DURATION] AS y#7, yyyy-MM-dd[KEYWORD] AS fmt#9, DATEFORMAT(yyyy-MM-dd[KEYWORD],y{r}#7) AS date_format(fmt, y)#14]] + * | \_EsRelation[types][!alias_integer, boolean{f}#16, byte{f}#17, constant..] + * \_Aggregate[[date_format(fmt, y){r}#14],[SUM(x{r}#4,true[BOOLEAN],compensated[KEYWORD]) AS sum(x)#11, date_format(fmt, y){r + * }#14]] + * \_StubRelation[[!alias_integer, boolean{f}#16, byte{f}#17, constant_keyword-foo{f}#18, date{f}#19, date_nanos{f}#20, dense_ve + * ctor{f}#35, double{f}#21, float{f}#22, half_float{f}#23, integer{f}#25, ip{f}#26, keyword{f}#27, long{f}#28, scaled_float{f}#24, + * semantic_text{f}#34, short{f}#30, text{f}#31, unsigned_long{f}#29, version{f}#32, wildcard{f}#33, x{r}#4, y{r}#7, fmt{r}#9, + * date_format(fmt, y){r}#14]] + */ + public void testInlineStatsDateFormatWithBothReferenceAttributes() { + var query = """ + from test + | eval x = integer::long + 10, y = date_nanos + 1 day + 1 year - 1 hour, fmt = "yyyy-MM-dd" + | inline stats sum(x) by date_format(fmt, y) + """; + if (releaseBuildForInlineStats(query)) { + return; + } + var optimized = planTypes(query); + + var limit = as(optimized, Limit.class); + var inlineJoin = as(limit.child(), InlineJoin.class); + + // Left side: should have DATE_FORMAT conversion + var leftEval = as(inlineJoin.left(), Eval.class); + assertThat(leftEval.fields(), hasSize(4)); // x, y, fmt, and DATE_FORMAT conversion + var dateFormat = as(leftEval.fields().get(3).child(), DateFormat.class); + var leftSource = as(leftEval.child(), EsRelation.class); + + // Right side: aggregation + var rightAgg = as(inlineJoin.right(), Aggregate.class); + assertThat(rightAgg.groupings(), hasSize(1)); // grouped by date_format result + assertThat(rightAgg.aggregates(), hasSize(2)); // sum(x) + grouping field + var rightSource = as(rightAgg.child(), StubRelation.class); + } + + /** + * Project[[count(*){r}#7001, concat(x, "01"){r}#7003, x{r}#7000]] + * \_Eval[[CONCAT(x{r}#7000,01[KEYWORD]) AS concat(x, "01")#7003]] + * \_Limit[1000[INTEGER],false] + * \_Aggregate[[x{r}#7000],[COUNT(*[KEYWORD],true[BOOLEAN]) AS count(*)#7001, x{r}#7000]] + * \_Eval[[DATEFORMAT(y-MM-dd[KEYWORD],hire_date{f}#7012) AS x#7000]] + * 
\_EsRelation[test][_meta_field{f}#7011, emp_no{f}#7005, first_name{f}#..] + */ + public void testStatsCountConcatGroupingByDateFormat() { + var plan = plan(""" + from test | stats count(*), concat(x, "01") by x = date_format("y-MM-dd", hire_date) + """); + + var project = as(plan, Project.class); + + var eval = as(project.child(), Eval.class); + var concatAlias = as(eval.fields().get(0), Alias.class); + var concat = as(concatAlias.child(), Concat.class); + + var limit = as(eval.child(), Limit.class); + var agg = as(limit.child(), Aggregate.class); + + var evalInner = as(agg.child(), Eval.class); + var dateFormatAlias = as(evalInner.fields().get(0), Alias.class); + var dateFormat = as(dateFormatAlias.child(), DateFormat.class); + + var relation = as(evalInner.child(), EsRelation.class); + } + + /** + *Limit[1000[INTEGER],false] + * \_InlineJoin[LEFT,[x{r}#9486],[x{r}#9486]] + * |_Eval[[DATEFORMAT(y-MM-dd[KEYWORD],hire_date{f}#9494) AS x#9486]] + * | \_EsRelation[test][_meta_field{f}#9493, emp_no{f}#9487, first_name{f}#..] + * \_Project[[count(*){r}#9481, concat(x, "01"){r}#9483, x{r}#9486]] + * \_Eval[[CONCAT(x{r}#9486,01[KEYWORD]) AS concat(x, "01")#9483]] + * \_Aggregate[[x{r}#9486],[COUNT(*[KEYWORD],true[BOOLEAN]) AS count(*)#9481, x{r}#9486]] + * \_StubRelation[[_meta_field{f}#9493, emp_no{f}#9487, first_name{f}#9488, gender{f}#9489, hire_date{f}#9494, job{f}#9495, job. + * raw{f}#9496, languages{f}#9490, last_name{f}#9491, long_noidx{f}#9497, salary{f}#9492, x{r}#9486]] + */ + public void testInlineStatsCountConcatGroupingByDateFormat() { + var plan = plan(""" + from test | inline stats count(*), concat(x, "01") by x = date_format("y-MM-dd", hire_date) + """); + + var limit = as(plan, Limit.class); + var inlineJoin = as(limit.child(), InlineJoin.class); + + // Verify the left side (eval with date_format) + var eval = as(inlineJoin.left(), Eval.class); + var relation = as(eval.child(), EsRelation.class); + + // Verify the right side (project with aggregates) + var project = as(inlineJoin.right(), Project.class); + var evalConcat = as(project.child(), Eval.class); + var aggregate = as(evalConcat.child(), Aggregate.class); + var stubRelation = as(aggregate.child(), StubRelation.class); + } + + /** + * Limit[1000[INTEGER],false] + * \_Aggregate[[x{r}#961],[MAX(x{r}#961,true[BOOLEAN]) AS a#964, x{r}#961]] + * \_Eval[[DATEFORMAT(y-MM-dd[KEYWORD],hire_date{f}#973) AS x#961]] + * \_EsRelation[test][_meta_field{f}#972, emp_no{f}#966, first_name{f}#96..] + */ + public void testStatsMaxGroupingByDateFormat() { + var plan = plan(""" + from test | stats a = max(x) by x = date_format("y-MM-dd", hire_date) + """); + + var limit = as(plan, Limit.class); + var aggregate = as(limit.child(), Aggregate.class); + var eval = as(aggregate.child(), Eval.class); + var relation = as(eval.child(), EsRelation.class); + } + + /** + * + *Limit[1000[INTEGER],false] + * \_InlineJoin[LEFT,[x{r}#4757],[x{r}#4757]] + * |_Eval[[DATEFORMAT(y-MM-dd[KEYWORD],hire_date{f}#4765) AS x#4757]] + * | \_EsRelation[test][_meta_field{f}#4764, emp_no{f}#4758, first_name{f}#..] + * \_Aggregate[[x{r}#4757],[MAX(x{r}#4757,true[BOOLEAN]) AS a#4754, x{r}#4757]] + * \_StubRelation[[_meta_field{f}#4764, emp_no{f}#4758, first_name{f}#4759, gender{f}#4760, hire_date{f}#4765, job{f}#4766, job. 
+ * raw{f}#4767, languages{f}#4761, last_name{f}#4762, long_noidx{f}#4768, salary{f}#4763, x{r}#4757]] + */ + public void testInlineStatsMaxGroupingByDateFormat() { + var plan = plan(""" + from test | inline stats a = max(x) by x = date_format("y-MM-dd", hire_date) + """); + + var limit = as(plan, Limit.class); + var inlineJoin = as(limit.child(), InlineJoin.class); + + // Verify the left side (eval with date_format) + var eval = as(inlineJoin.left(), Eval.class); + var relation = as(eval.child(), EsRelation.class); + + // Verify eval contains date_format + assertThat(eval.fields(), hasSize(1)); + var evalAlias = as(eval.fields().get(0), Alias.class); + assertThat(evalAlias.child(), instanceOf(DateFormat.class)); + + // Verify the right side (aggregate) + var aggregate = as(inlineJoin.right(), Aggregate.class); + var stubRelation = as(aggregate.child(), StubRelation.class); + } + public void testPushDownConjunctionsToKnnPrefilter() { var query = """ from types