Skip to content

Commit afa1009

Browse files
committed
Merge updates for the chrono & jiff DateTimeParsers and benchmarks. Overall, jiff seems slightly faster in the case where parsing has no errors, but slower when using multiple formats (higher cost for error handling).
1 parent 5b719a8 commit afa1009

File tree

6 files changed

+128
-257
lines changed

6 files changed

+128
-257
lines changed

Cargo.lock

Lines changed: 32 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/functions/benches/to_timestamp.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,12 @@ use arrow::array::builder::StringBuilder;
2424
use arrow::array::{Array, ArrayRef, StringArray};
2525
use arrow::compute::cast;
2626
use arrow::datatypes::{DataType, Field, TimeUnit};
27-
use criterion::{criterion_group, criterion_main, Criterion};
27+
use criterion::{Criterion, criterion_group, criterion_main};
2828
use datafusion_common::config::ConfigOptions;
2929
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
30+
use datafusion_functions::datetime::parser::DateTimeParser;
3031
use datafusion_functions::datetime::parser::chrono::ChronoDateTimeParser;
3132
use datafusion_functions::datetime::parser::jiff::JiffDateTimeParser;
32-
use datafusion_functions::datetime::parser::DateTimeParser;
3333
use datafusion_functions::datetime::to_timestamp;
3434
use itertools::izip;
3535

datafusion/functions/src/datetime/common.rs

Lines changed: 32 additions & 214 deletions
Original file line numberDiff line numberDiff line change
@@ -15,59 +15,21 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use std::sync::{Arc, LazyLock};
18+
use std::sync::Arc;
1919

20-
use arrow::array::timezone::Tz;
2120
use arrow::array::{
2221
Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray,
2322
StringArrayType, StringViewArray,
2423
};
2524
use arrow::compute::DecimalCast;
26-
use arrow::compute::kernels::cast_utils::string_to_datetime;
2725
use arrow::datatypes::{DataType, TimeUnit};
2826
use arrow_buffer::ArrowNativeType;
29-
use chrono::LocalResult::Single;
30-
use chrono::format::{Parsed, StrftimeItems, parse};
31-
use chrono::{DateTime, TimeZone, Utc};
3227
use datafusion_common::cast::as_generic_string_array;
3328
use datafusion_common::{
34-
DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err,
35-
internal_datafusion_err, unwrap_or_internal_err,
29+
Result, ScalarValue, exec_err, internal_datafusion_err, unwrap_or_internal_err,
3630
};
3731
use datafusion_expr::ColumnarValue;
3832

39-
/// Error message if nanosecond conversion request beyond supported interval
40-
const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804";
41-
42-
static UTC: LazyLock<Tz> = LazyLock::new(|| "UTC".parse().expect("UTC is always valid"));
43-
44-
/// Converts a string representation of a date‑time into a timestamp expressed in
45-
/// nanoseconds since the Unix epoch.
46-
///
47-
/// This helper is a thin wrapper around the more general `string_to_datetime`
48-
/// function. It accepts an optional `timezone` which, if `None`, defaults to
49-
/// Coordinated Universal Time (UTC). The string `s` must contain a valid
50-
/// date‑time format that can be parsed by the underlying chrono parser.
51-
///
52-
/// # Return Value
53-
///
54-
/// * `Ok(i64)` – The number of nanoseconds since `1970‑01‑01T00:00:00Z`.
55-
/// * `Err(DataFusionError)` – If the string cannot be parsed, the parsed
56-
/// value is outside the supported range (1677-09-21T00:12:44.0 to 2262-04-11T23:47:16.854775804)
57-
/// or the parsed value does not correspond to an unambiguous time.
58-
pub(crate) fn string_to_timestamp_nanos_with_timezone(
59-
timezone: &Option<Tz>,
60-
s: &str,
61-
) -> Result<i64> {
62-
let tz = timezone.as_ref().unwrap_or(&UTC);
63-
let dt = string_to_datetime(tz, s)?;
64-
let parsed = dt
65-
.timestamp_nanos_opt()
66-
.ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?;
67-
68-
Ok(parsed)
69-
}
70-
7133
/// Checks that all the arguments from the second onward are of type [Utf8], [LargeUtf8] or [Utf8View]
7234
///
7335
/// [Utf8]: DataType::Utf8
@@ -92,161 +54,6 @@ pub(crate) fn validate_data_types(args: &[ColumnarValue], name: &str) -> Result<
9254
Ok(())
9355
}
9456

95-
/// Accepts a string and parses it using the [`chrono::format::strftime`] specifiers
96-
/// relative to the provided `timezone`
97-
///
98-
/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error
99-
/// will be returned
100-
///
101-
/// Note that parsing [IANA timezones] is not supported yet in chrono - <https://github.com/chronotope/chrono/issues/38>
102-
/// and this implementation only supports named timezones at the end of the string preceded by a space.
103-
///
104-
/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html
105-
/// [IANA timezones]: https://www.iana.org/time-zones
106-
pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
107-
timezone: &T,
108-
s: &str,
109-
format: &str,
110-
) -> Result<DateTime<T>, DataFusionError> {
111-
let err = |err_ctx: &str| {
112-
exec_datafusion_err!(
113-
"Error parsing timestamp from '{s}' using format '{format}': {err_ctx}"
114-
)
115-
};
116-
117-
let mut datetime_str = s;
118-
let mut format = format;
119-
120-
// Manually handle the most common case of a named timezone at the end of the timestamp.
121-
// Note that %+ handles 'Z' at the end of the string without a space. This code doesn't
122-
// handle named timezones with no preceding space since that would require writing a
123-
// custom parser (or switching to Jiff)
124-
let tz: Option<chrono_tz::Tz> = if format.trim_end().ends_with(" %Z") {
125-
// grab the string after the last space as the named timezone
126-
if let Some((dt_str, timezone_name)) = datetime_str.trim_end().rsplit_once(' ') {
127-
datetime_str = dt_str;
128-
129-
// attempt to parse the timezone name
130-
let result: Result<chrono_tz::Tz, chrono_tz::ParseError> =
131-
timezone_name.parse();
132-
let Ok(tz) = result else {
133-
return Err(err(&result.unwrap_err().to_string()));
134-
};
135-
136-
// successfully parsed the timezone name, remove the ' %Z' from the format
137-
format = &format[..format.len() - 3];
138-
139-
Some(tz)
140-
} else {
141-
None
142-
}
143-
} else if format.contains("%Z") {
144-
return Err(err(
145-
"'%Z' is only supported at the end of the format string preceded by a space",
146-
));
147-
} else {
148-
None
149-
};
150-
151-
let mut parsed = Parsed::new();
152-
parse(&mut parsed, datetime_str, StrftimeItems::new(format))
153-
.map_err(|e| err(&e.to_string()))?;
154-
155-
let dt = match tz {
156-
Some(tz) => {
157-
// A timezone was manually parsed out, convert it to a fixed offset
158-
match parsed.to_datetime_with_timezone(&tz) {
159-
Ok(dt) => Ok(dt.fixed_offset()),
160-
Err(e) => Err(e),
161-
}
162-
}
163-
// default to parse the string assuming it has a timezone
164-
None => parsed.to_datetime(),
165-
};
166-
167-
if let Err(e) = &dt {
168-
// no timezone or other failure, try without a timezone
169-
let ndt = parsed
170-
.to_naive_datetime_with_offset(0)
171-
.or_else(|_| parsed.to_naive_date().map(|nd| nd.into()));
172-
if let Err(e) = &ndt {
173-
return Err(err(&e.to_string()));
174-
}
175-
176-
if let Single(e) = &timezone.from_local_datetime(&ndt.unwrap()) {
177-
Ok(e.to_owned())
178-
} else {
179-
Err(err(&e.to_string()))
180-
}
181-
} else {
182-
Ok(dt.unwrap().with_timezone(timezone))
183-
}
184-
}
185-
186-
/// Accepts a string with a `chrono` format and converts it to a
187-
/// nanosecond precision timestamp relative to the provided `timezone`.
188-
///
189-
/// See [`chrono::format::strftime`] for the full set of supported formats.
190-
///
191-
/// Implements the `to_timestamp` function to convert a string to a
192-
/// timestamp, following the model of Spark SQL’s `to_timestamp`.
193-
///
194-
/// Internally, this function uses the `chrono` library for the
195-
/// datetime parsing
196-
///
197-
/// ## Timestamp Precision
198-
///
199-
/// Function uses the maximum precision timestamps supported by
200-
/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This
201-
/// means the range of dates that timestamps can represent is ~1677 AD
202-
/// to 2262 AD
203-
///
204-
/// ## Timezone / Offset Handling
205-
///
206-
/// Numerical values of timestamps are stored relative to UTC.
207-
///
208-
/// Any timestamp in the formatting string is handled according to the rules
209-
/// defined by `chrono`.
210-
///
211-
/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html
212-
#[inline]
213-
pub(crate) fn string_to_timestamp_nanos_formatted_with_timezone(
214-
timezone: &Option<Tz>,
215-
s: &str,
216-
format: &str,
217-
) -> Result<i64, DataFusionError> {
218-
let dt = string_to_datetime_formatted(timezone.as_ref().unwrap_or(&UTC), s, format)?;
219-
let parsed = dt
220-
.timestamp_nanos_opt()
221-
.ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?;
222-
223-
Ok(parsed)
224-
}
225-
226-
/// Accepts a string with a `chrono` format and converts it to a
227-
/// millisecond precision timestamp relative to the provided `timezone`.
228-
///
229-
/// See [`chrono::format::strftime`] for the full set of supported formats.
230-
///
231-
/// Internally, this function uses the `chrono` library for the
232-
/// datetime parsing
233-
///
234-
/// ## Timezone / Offset Handling
235-
///
236-
/// Numerical values of timestamps are stored relative to UTC.
237-
///
238-
/// Any timestamp in the formatting string is handled according to the rules
239-
/// defined by `chrono`.
240-
///
241-
/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html
242-
#[inline]
243-
pub(crate) fn string_to_timestamp_millis_formatted(s: &str, format: &str) -> Result<i64> {
244-
Ok(string_to_datetime_formatted(&Utc, s, format)?
245-
.naive_utc()
246-
.and_utc()
247-
.timestamp_millis())
248-
}
249-
25057
pub(crate) fn handle<O, F>(
25158
args: &[ColumnarValue],
25259
op: F,
@@ -306,7 +113,7 @@ pub(crate) fn handle_multiple<O, F, M>(
306113
) -> Result<ColumnarValue>
307114
where
308115
O: ArrowPrimitiveType,
309-
F: Fn(&str, &str) -> Result<O::Native>,
116+
F: Fn(&str, &[&str]) -> Result<O::Native>,
310117
M: Fn(O::Native) -> O::Native,
311118
{
312119
match &args[0] {
@@ -372,7 +179,7 @@ where
372179
};
373180

374181
if let Some(s) = x {
375-
match op(a, s.as_str()) {
182+
match op(a, &[s.as_str()]) {
376183
Ok(r) => {
377184
let result = op2(r).to_i64();
378185
let s = scalar_value(dt, result)?;
@@ -411,7 +218,7 @@ pub(crate) fn strings_to_primitive_function<O, F, F2>(
411218
) -> Result<PrimitiveArray<O>>
412219
where
413220
O: ArrowPrimitiveType,
414-
F: Fn(&str, &str) -> Result<O::Native>,
221+
F: Fn(&str, &[&str]) -> Result<O::Native>,
415222
F2: Fn(O::Native) -> O::Native,
416223
{
417224
if args.len() < 2 {
@@ -472,7 +279,7 @@ fn handle_array_op<'a, O, V, F, F2>(
472279
where
473280
V: StringArrayType<'a>,
474281
O: ArrowPrimitiveType,
475-
F: Fn(&str, &str) -> Result<O::Native>,
282+
F: Fn(&str, &[&str]) -> Result<O::Native>,
476283
F2: Fn(O::Native) -> O::Native,
477284
{
478285
first
@@ -481,28 +288,39 @@ where
481288
.map(|(pos, x)| {
482289
let mut val = None;
483290
if let Some(x) = x {
291+
let mut v = vec![];
292+
484293
for arg in args {
485-
let v = match arg {
294+
match arg {
486295
ColumnarValue::Array(a) => match a.data_type() {
487-
DataType::Utf8View => Ok(a.as_string_view().value(pos)),
488-
DataType::LargeUtf8 => Ok(a.as_string::<i64>().value(pos)),
489-
DataType::Utf8 => Ok(a.as_string::<i32>().value(pos)),
490-
other => exec_err!("Unexpected type encountered '{other}'"),
296+
DataType::Utf8View => v.push(a.as_string_view().value(pos)),
297+
DataType::LargeUtf8 => {
298+
v.push(a.as_string::<i64>().value(pos))
299+
}
300+
DataType::Utf8 => v.push(a.as_string::<i32>().value(pos)),
301+
other => {
302+
return exec_err!(
303+
"Unexpected type encountered '{other}'"
304+
);
305+
}
491306
},
492307
ColumnarValue::Scalar(s) => match s.try_as_str() {
493-
Some(Some(v)) => Ok(v),
308+
Some(Some(s)) => v.push(s),
494309
Some(None) => continue, // null string
495-
None => exec_err!("Unexpected scalar type encountered '{s}'"),
310+
None => {
311+
return exec_err!(
312+
"Unexpected scalar type encountered '{s}'"
313+
);
314+
}
496315
},
497-
}?;
316+
};
317+
}
498318

499-
let r = op(x, v);
500-
if let Ok(inner) = r {
501-
val = Some(Ok(op2(inner)));
502-
break;
503-
} else {
504-
val = Some(r);
505-
}
319+
let r = op(x, &v);
320+
if let Ok(inner) = r {
321+
val = Some(Ok(op2(inner)));
322+
} else {
323+
val = Some(r);
506324
}
507325
};
508326

0 commit comments

Comments
 (0)