-
Notifications
You must be signed in to change notification settings - Fork 2k
Respect execution timezone in to_timestamp and related functions
#18025
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
8400b93
95a21ff
ac370e7
74adc27
7bf5cb3
09f6646
6d54f88
11f0ed2
775f11f
e96a42b
d73c571
48ddfba
52cd505
ea47fa6
b741b25
e2887e0
90bba02
8d9c52f
5bdabd3
ce3d135
667ada9
9ea5a68
5ffa4dd
c666c79
a1ba817
556e438
28d3b9b
9c00bf8
f66345e
764dfd3
de19df5
d9b3d4b
b4bb002
e61918f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,8 +15,10 @@ | |
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| use std::str::FromStr; | ||
| use std::sync::Arc; | ||
|
|
||
| use arrow::array::timezone::Tz; | ||
| use arrow::array::{ | ||
| Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray, | ||
| StringArrayType, StringViewArray, | ||
|
|
@@ -25,7 +27,7 @@ use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; | |
| use arrow::datatypes::DataType; | ||
| use chrono::format::{parse, Parsed, StrftimeItems}; | ||
| use chrono::LocalResult::Single; | ||
| use chrono::{DateTime, TimeZone, Utc}; | ||
| use chrono::{DateTime, FixedOffset, LocalResult, NaiveDateTime, TimeZone, Utc}; | ||
|
|
||
| use datafusion_common::cast::as_generic_string_array; | ||
| use datafusion_common::{ | ||
|
|
@@ -42,6 +44,167 @@ pub(crate) fn string_to_timestamp_nanos_shim(s: &str) -> Result<i64> { | |
| string_to_timestamp_nanos(s).map_err(|e| e.into()) | ||
| } | ||
|
|
||
| #[derive(Clone, Copy)] | ||
| enum ConfiguredZone { | ||
| Named(Tz), | ||
| Offset(FixedOffset), | ||
| } | ||
|
|
||
| #[derive(Clone)] | ||
| pub(crate) struct ConfiguredTimeZone { | ||
| repr: Arc<str>, | ||
| zone: ConfiguredZone, | ||
| } | ||
|
|
||
| impl ConfiguredTimeZone { | ||
| pub(crate) fn utc() -> Self { | ||
| Self { | ||
| repr: Arc::from("+00:00"), | ||
| zone: ConfiguredZone::Offset(FixedOffset::east_opt(0).unwrap()), | ||
| } | ||
| } | ||
|
|
||
| pub(crate) fn parse(tz: &str) -> Result<Self> { | ||
| if tz.trim().is_empty() { | ||
| return Ok(Self::utc()); | ||
| } | ||
|
|
||
| if let Ok(named) = Tz::from_str(tz) { | ||
| return Ok(Self { | ||
| repr: Arc::from(tz), | ||
| zone: ConfiguredZone::Named(named), | ||
| }); | ||
| } | ||
|
|
||
| if let Some(offset) = parse_fixed_offset(tz) { | ||
| return Ok(Self { | ||
| repr: Arc::from(tz), | ||
| zone: ConfiguredZone::Offset(offset), | ||
| }); | ||
| } | ||
|
|
||
| Err(exec_datafusion_err!( | ||
| "Invalid execution timezone '{tz}'. Please provide an IANA timezone name (e.g. 'America/New_York') or an offset in the form '+HH:MM'." | ||
| )) | ||
| } | ||
|
|
||
| fn timestamp_from_naive(&self, naive: &NaiveDateTime) -> Result<i64> { | ||
| match self.zone { | ||
| ConfiguredZone::Named(tz) => { | ||
| local_datetime_to_timestamp(tz.from_local_datetime(naive), &self.repr) | ||
| } | ||
| ConfiguredZone::Offset(offset) => { | ||
| local_datetime_to_timestamp(offset.from_local_datetime(naive), &self.repr) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| fn datetime_from_formatted(&self, s: &str, format: &str) -> Result<DateTime<Utc>> { | ||
| let datetime = match self.zone { | ||
| ConfiguredZone::Named(tz) => { | ||
| string_to_datetime_formatted(&tz, s, format)?.with_timezone(&Utc) | ||
| } | ||
| ConfiguredZone::Offset(offset) => { | ||
| string_to_datetime_formatted(&offset, s, format)?.with_timezone(&Utc) | ||
| } | ||
| }; | ||
| Ok(datetime) | ||
| } | ||
| } | ||
|
|
||
| fn parse_fixed_offset(tz: &str) -> Option<FixedOffset> { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am unsure as for the need for this as I think it already exists in https://github.com/apache/arrow-rs/blob/751b0822a7f0b2647c1c662131131b35f268bfef/arrow-array/src/timezone.rs#L25. As well, the Tz::from_str I think may already handle named and offsets https://github.com/apache/arrow-rs/blob/751b0822a7f0b2647c1c662131131b35f268bfef/arrow-array/src/timezone.rs#L91
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe it is a private fn
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, I'll file a ticket to make that pub. |
||
| let tz = tz.trim(); | ||
| if tz.eq_ignore_ascii_case("utc") || tz.eq_ignore_ascii_case("z") { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I double checked to confirm that it does not handle lower case:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Interesting. While I expect it's to spec I can't see a reason why that impl shouldn't be case insensitive. I'll file a ticket for that as well. |
||
| return FixedOffset::east_opt(0); | ||
| } | ||
|
|
||
| let (sign, rest) = if let Some(rest) = tz.strip_prefix('+') { | ||
|
||
| (1, rest) | ||
| } else if let Some(rest) = tz.strip_prefix('-') { | ||
| (-1, rest) | ||
| } else { | ||
| return None; | ||
| }; | ||
|
|
||
| let (hours, minutes) = if let Some((hours, minutes)) = rest.split_once(':') { | ||
| (hours, minutes) | ||
| } else if rest.len() == 4 { | ||
| rest.split_at(2) | ||
| } else { | ||
| return None; | ||
| }; | ||
|
|
||
| let hours: i32 = hours.parse().ok()?; | ||
| let minutes: i32 = minutes.parse().ok()?; | ||
| if hours > 23 || minutes > 59 { | ||
| return None; | ||
| } | ||
|
|
||
| let total_minutes = hours * 60 + minutes; | ||
| let total_seconds = sign * total_minutes * 60; | ||
| FixedOffset::east_opt(total_seconds) | ||
| } | ||
|
|
||
| fn local_datetime_to_timestamp<T: TimeZone>( | ||
| result: LocalResult<DateTime<T>>, | ||
| tz_repr: &str, | ||
| ) -> Result<i64> { | ||
| match result { | ||
| Single(dt) => datetime_to_timestamp(dt.with_timezone(&Utc)), | ||
| LocalResult::Ambiguous(dt1, dt2) => Err(exec_datafusion_err!( | ||
| "The local time '{:?}' is ambiguous in timezone '{tz_repr}' (also corresponds to '{:?}').", | ||
| dt1.naive_local(), | ||
| dt2.naive_local() | ||
| )), | ||
| LocalResult::None => Err(exec_datafusion_err!( | ||
| "The local time is invalid in timezone '{tz_repr}'." | ||
| )), | ||
| } | ||
| } | ||
|
|
||
| fn datetime_to_timestamp(datetime: DateTime<Utc>) -> Result<i64> { | ||
| datetime | ||
| .timestamp_nanos_opt() | ||
| .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}")) | ||
| } | ||
|
|
||
| fn timestamp_to_naive(value: i64) -> Result<NaiveDateTime> { | ||
| let secs = value.div_euclid(1_000_000_000); | ||
| let nanos = value.rem_euclid(1_000_000_000) as u32; | ||
| DateTime::<Utc>::from_timestamp(secs, nanos) | ||
| .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}")) | ||
| .map(|dt| dt.naive_utc()) | ||
| } | ||
|
|
||
| fn has_explicit_timezone(value: &str) -> bool { | ||
| if DateTime::parse_from_rfc3339(value).is_ok() { | ||
| return true; | ||
| } | ||
|
|
||
| if let Some(pos) = value.rfind(|c| ['T', ' '].contains(&c)) { | ||
| let tail = &value[pos + 1..]; | ||
| tail.contains('Z') | ||
| || tail.contains('z') | ||
| || tail.contains('+') | ||
| || tail.contains('-') | ||
| } else { | ||
| false | ||
| } | ||
| } | ||
|
|
||
| pub(crate) fn string_to_timestamp_nanos_with_timezone( | ||
| timezone: &ConfiguredTimeZone, | ||
| s: &str, | ||
| ) -> Result<i64> { | ||
| let ts = string_to_timestamp_nanos_shim(s)?; | ||
| if has_explicit_timezone(s) { | ||
| Ok(ts) | ||
| } else { | ||
| let naive = timestamp_to_naive(ts)?; | ||
| timezone.timestamp_from_naive(&naive) | ||
| } | ||
| } | ||
|
|
||
| /// Checks that all the arguments from the second are of type [Utf8], [LargeUtf8] or [Utf8View] | ||
| /// | ||
| /// [Utf8]: DataType::Utf8 | ||
|
|
@@ -142,6 +305,7 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>( | |
| /// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html | ||
| /// | ||
| #[inline] | ||
| #[allow(dead_code)] | ||
|
||
| pub(crate) fn string_to_timestamp_nanos_formatted( | ||
| s: &str, | ||
| format: &str, | ||
|
|
@@ -153,6 +317,15 @@ pub(crate) fn string_to_timestamp_nanos_formatted( | |
| .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}")) | ||
| } | ||
|
|
||
| pub(crate) fn string_to_timestamp_nanos_formatted_with_timezone( | ||
| timezone: &ConfiguredTimeZone, | ||
| s: &str, | ||
| format: &str, | ||
| ) -> Result<i64, DataFusionError> { | ||
| let datetime = timezone.datetime_from_formatted(s, format)?; | ||
| datetime_to_timestamp(datetime) | ||
| } | ||
|
|
||
| /// Accepts a string with a `chrono` format and converts it to a | ||
| /// millisecond precision timestamp. | ||
| /// | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that we may want to change this to allow for None - see the start of discussion @ #17993 (comment) and #18017 (comment)