From 288b7b73939302a45e5d043cf5c57a3044faef5f Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Wed, 23 Jul 2025 06:45:43 +0800 Subject: [PATCH 1/4] speedup (~7x faster in some cases) Signed-off-by: Ruihang Xia --- .../functions/src/datetime/date_trunc.rs | 64 ++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index 8963ef77a53b..856ec489ed98 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -28,7 +28,7 @@ use arrow::array::types::{ ArrowTimestampType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, }; -use arrow::array::{Array, PrimitiveArray}; +use arrow::array::{Array, ArrayRef, Int64Array, PrimitiveArray}; use arrow::datatypes::DataType::{self, Null, Timestamp, Utf8, Utf8View}; use arrow::datatypes::TimeUnit::{self, Microsecond, Millisecond, Nanosecond, Second}; use datafusion_common::cast::as_primitive_array; @@ -60,6 +60,8 @@ use chrono::{ - hour / HOUR - minute / MINUTE - second / SECOND + - millisecond / MILLISECOND + - microsecond / MICROSECOND "# ), argument( @@ -185,6 +187,21 @@ impl ScalarUDFImpl for DateTruncFunc { ) -> Result { let parsed_tz = parse_tz(tz_opt)?; let array = as_primitive_array::(array)?; + + // fast path for fine granularities + if matches!( + granularity.as_str(), + "second" | "minute" | "millisecond" | "microsecond" + ) || (parsed_tz.is_none() && granularity.as_str() == "hour") + { + let result = general_date_trunc_array_fine_granularity( + T::UNIT, + array, + granularity.as_str(), + )?; + return Ok(ColumnarValue::Array(result)); + } + let array: PrimitiveArray = array .try_unary(|x| { general_date_trunc(T::UNIT, x, parsed_tz, granularity.as_str()) @@ -423,6 +440,51 @@ fn date_trunc_coarse(granularity: &str, value: i64, tz: Option) -> Result( + tu: TimeUnit, + array: &PrimitiveArray, + granularity: &str, +) -> Result { + let unit = match (tu, granularity) { + (Second, "minute") => Some(Int64Array::new_scalar(60)), + (Second, "hour") => Some(Int64Array::new_scalar(3600)), + + (Millisecond, "second") => Some(Int64Array::new_scalar(1_000)), + (Millisecond, "minute") => Some(Int64Array::new_scalar(60_000)), + (Millisecond, "hour") => Some(Int64Array::new_scalar(3_600_000)), + + (Microsecond, "millisecond") => Some(Int64Array::new_scalar(1_000)), + (Microsecond, "second") => Some(Int64Array::new_scalar(1_000_000)), + (Microsecond, "minute") => Some(Int64Array::new_scalar(60_000_000)), + (Microsecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000)), + + (Nanosecond, "microsecond") => Some(Int64Array::new_scalar(1_000)), + (Nanosecond, "millisecond") => Some(Int64Array::new_scalar(1_000_000)), + (Nanosecond, "second") => Some(Int64Array::new_scalar(1_000_000_000)), + (Nanosecond, "minute") => Some(Int64Array::new_scalar(60_000_000_000)), + (Nanosecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000_000)), + _ => None, + }; + + if let Some(unit) = unit { + let original_type = array.data_type(); + let array = arrow::compute::cast(array, &DataType::Int64)?; + let array = arrow::compute::kernels::numeric::div(&array, &unit)?; + let array = arrow::compute::kernels::numeric::mul(&array, &unit)?; + let array = arrow::compute::cast(&array, original_type)?; + Ok(array) + } else { + // truncate to the same or smaller unit + Ok(Arc::new(array.clone())) + } +} + // truncates a single value with the given timeunit to the specified granularity fn general_date_trunc( tu: TimeUnit, From 6b9de5bae3cb9e7a12ae1bc1b5e006afa0d03b70 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Wed, 23 Jul 2025 07:30:38 +0800 Subject: [PATCH 2/4] update function document Signed-off-by: Ruihang Xia --- docs/source/user-guide/sql/scalar_functions.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 7b4bb71d1c59..d49fc22dabb4 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2150,6 +2150,8 @@ date_trunc(precision, expression) - hour / HOUR - minute / MINUTE - second / SECOND + - millisecond / MILLISECOND + - microsecond / MICROSECOND - **expression**: Time expression to operate on. Can be a constant, column, or function. From f50fb3fed397815287669421c450e42007797bac Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Thu, 24 Jul 2025 01:50:33 +0800 Subject: [PATCH 3/4] add a case for Asia/Kathmandu Signed-off-by: Ruihang Xia --- datafusion/functions/src/datetime/date_trunc.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index 856ec489ed98..46f01ebee8d9 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -946,6 +946,21 @@ mod tests { "2018-11-04T02:00:00-02", ], ), + ( + vec![ + "2024-10-26T23:30:00Z", + "2024-10-27T00:30:00Z", + "2024-10-27T01:30:00Z", + "2024-10-27T02:30:00Z", + ], + Some("Asia/Kathmandu".into()), // UTC+5:45 + vec![ + "2024-10-27T05:00:00+05:45", + "2024-10-27T06:00:00+05:45", + "2024-10-27T07:00:00+05:45", + "2024-10-27T08:00:00+05:45", + ], + ), ]; cases.iter().for_each(|(original, tz_opt, expected)| { From 346ae5952afe70be5f1ecd137507d9314503a4bd Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Fri, 25 Jul 2025 05:27:24 +0800 Subject: [PATCH 4/4] add day to fast path and some comments Signed-off-by: Ruihang Xia --- datafusion/functions/src/datetime/date_trunc.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index 46f01ebee8d9..d3d52e237e15 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -191,8 +191,13 @@ impl ScalarUDFImpl for DateTruncFunc { // fast path for fine granularities if matches!( granularity.as_str(), + // For morden timezones, it's correct to truncate "minute" in this way. + // Both datafusion and arrow are ignoring historical timezone's non-minute granularity + // bias (e.g., Asia/Kathmandu before 1919 is UTC+05:41:16). "second" | "minute" | "millisecond" | "microsecond" - ) || (parsed_tz.is_none() && granularity.as_str() == "hour") + ) || + // In UTC, "hour" and "day" have uniform durations and can be truncated with simple arithmetic + (parsed_tz.is_none() && matches!(granularity.as_str(), "hour" | "day")) { let result = general_date_trunc_array_fine_granularity( T::UNIT, @@ -454,21 +459,25 @@ fn general_date_trunc_array_fine_granularity( let unit = match (tu, granularity) { (Second, "minute") => Some(Int64Array::new_scalar(60)), (Second, "hour") => Some(Int64Array::new_scalar(3600)), + (Second, "day") => Some(Int64Array::new_scalar(86400)), (Millisecond, "second") => Some(Int64Array::new_scalar(1_000)), (Millisecond, "minute") => Some(Int64Array::new_scalar(60_000)), (Millisecond, "hour") => Some(Int64Array::new_scalar(3_600_000)), + (Millisecond, "day") => Some(Int64Array::new_scalar(86_400_000)), (Microsecond, "millisecond") => Some(Int64Array::new_scalar(1_000)), (Microsecond, "second") => Some(Int64Array::new_scalar(1_000_000)), (Microsecond, "minute") => Some(Int64Array::new_scalar(60_000_000)), (Microsecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000)), + (Microsecond, "day") => Some(Int64Array::new_scalar(86_400_000_000)), (Nanosecond, "microsecond") => Some(Int64Array::new_scalar(1_000)), (Nanosecond, "millisecond") => Some(Int64Array::new_scalar(1_000_000)), (Nanosecond, "second") => Some(Int64Array::new_scalar(1_000_000_000)), (Nanosecond, "minute") => Some(Int64Array::new_scalar(60_000_000_000)), (Nanosecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000_000)), + (Nanosecond, "day") => Some(Int64Array::new_scalar(86_400_000_000_000)), _ => None, };