Skip to content

Commit 0a6fda5

Browse files
neilconwayalamb
authored andcommitted
perf: Optimize to_char to allocate less, fix NULL handling (apache#20635)
## Which issue does this PR close? - Closes apache#20634. ## Rationale for this change The current `to_char` implementation (both scalar and array paths) allocates a new string for every input row to hold the result of the `ArrayFormatter`. To produce the results, it uses `StringArray::from`, which copies. We can do better by using a reusable buffer to store the result of `ArrayFormatter`, and then append to `StringBuilder`. We can then construct the final result values from the `StringBuilder` very efficiently. This yields a 10-22% improvement on the `to_char` microbenchmarks. In the scalar path, we can do a bit better by having the `ArrayFormatter` write directly into the `StringBuilder`'s buffer, which saves a copy. In the array path, we can't easily do this, because `to_char` has some (dubious) logic to retry errors on `Date32` inputs by casting to `Date64` and retrying the ArrayFormatter. Since the `StringBuilder` is shared between all rows and there's no easy way to remove the partial write that might happen in the case of an error, we use the intermediate buffer here instead. This PR also cleans up various code in `to_char`, and fixes a bug in `NULL` handling: in the array case, if the current data value is NULL but the format string is non-NULL, we incorrectly returned an empty string instead of NULL. ## What changes are included in this PR? * Optimize `to_char` (scalar and array paths) as described above * Fix bug in NULL handling * Add SLT test case for NULL handling bug * Simplify and refactor various parts of `to_char`, particularly around error handling * Revise `to_char` benchmarks: the previous version tested `to_char` for `Date32` inputs where the format string specifies a time value, which is an odd corner case. Also get rid of benchmarking the scalar-scalar case; this is very fast and occurs rarely in practice (will usually be constant-folded before query execution). ## Are these changes tested? Yes. Benchmarked and added new test. ## Are there any user-facing changes? No.
1 parent 7698fdc commit 0a6fda5

File tree

3 files changed

+130
-164
lines changed

3 files changed

+130
-164
lines changed

datafusion/functions/benches/to_char.rs

Lines changed: 24 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,12 @@
1818
use std::hint::black_box;
1919
use std::sync::Arc;
2020

21-
use arrow::array::{ArrayRef, Date32Array, StringArray};
21+
use arrow::array::{ArrayRef, Date32Array, Date64Array, StringArray};
2222
use arrow::datatypes::{DataType, Field};
2323
use chrono::TimeDelta;
2424
use chrono::prelude::*;
2525
use criterion::{Criterion, criterion_group, criterion_main};
2626
use datafusion_common::ScalarValue;
27-
use datafusion_common::ScalarValue::TimestampNanosecond;
2827
use datafusion_common::config::ConfigOptions;
2928
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
3029
use datafusion_functions::datetime::to_char;
@@ -63,6 +62,26 @@ fn generate_date32_array(rng: &mut ThreadRng) -> Date32Array {
6362
Date32Array::from(data)
6463
}
6564

65+
fn generate_date64_array(rng: &mut ThreadRng) -> Date64Array {
66+
let start_date = "1970-01-01"
67+
.parse::<NaiveDate>()
68+
.expect("Date should parse");
69+
let end_date = "2050-12-31"
70+
.parse::<NaiveDate>()
71+
.expect("Date should parse");
72+
let mut data: Vec<i64> = Vec::with_capacity(1000);
73+
for _ in 0..1000 {
74+
let date = pick_date_in_range(rng, start_date, end_date);
75+
let millis = date
76+
.and_hms_opt(0, 0, 0)
77+
.unwrap()
78+
.and_utc()
79+
.timestamp_millis();
80+
data.push(millis);
81+
}
82+
Date64Array::from(data)
83+
}
84+
6685
const DATE_PATTERNS: [&str; 5] =
6786
["%Y:%m:%d", "%d-%m-%Y", "%d%m%Y", "%Y%m%d", "%Y...%m...%d"];
6887

@@ -155,7 +174,7 @@ fn criterion_benchmark(c: &mut Criterion) {
155174

156175
c.bench_function("to_char_array_datetime_patterns_1000", |b| {
157176
let mut rng = rand::rng();
158-
let data_arr = generate_date32_array(&mut rng);
177+
let data_arr = generate_date64_array(&mut rng);
159178
let batch_len = data_arr.len();
160179
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
161180
let patterns = ColumnarValue::Array(Arc::new(generate_datetime_pattern_array(
@@ -182,7 +201,7 @@ fn criterion_benchmark(c: &mut Criterion) {
182201

183202
c.bench_function("to_char_array_mixed_patterns_1000", |b| {
184203
let mut rng = rand::rng();
185-
let data_arr = generate_date32_array(&mut rng);
204+
let data_arr = generate_date64_array(&mut rng);
186205
let batch_len = data_arr.len();
187206
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
188207
let patterns = ColumnarValue::Array(Arc::new(generate_mixed_pattern_array(
@@ -235,7 +254,7 @@ fn criterion_benchmark(c: &mut Criterion) {
235254

236255
c.bench_function("to_char_scalar_datetime_pattern_1000", |b| {
237256
let mut rng = rand::rng();
238-
let data_arr = generate_date32_array(&mut rng);
257+
let data_arr = generate_date64_array(&mut rng);
239258
let batch_len = data_arr.len();
240259
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
241260
let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some(
@@ -259,38 +278,6 @@ fn criterion_benchmark(c: &mut Criterion) {
259278
)
260279
})
261280
});
262-
263-
c.bench_function("to_char_scalar_1000", |b| {
264-
let mut rng = rand::rng();
265-
let timestamp = "2026-07-08T09:10:11"
266-
.parse::<NaiveDateTime>()
267-
.unwrap()
268-
.with_nanosecond(56789)
269-
.unwrap()
270-
.and_utc()
271-
.timestamp_nanos_opt()
272-
.unwrap();
273-
let data = ColumnarValue::Scalar(TimestampNanosecond(Some(timestamp), None));
274-
let pattern =
275-
ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng))));
276-
277-
b.iter(|| {
278-
black_box(
279-
to_char()
280-
.invoke_with_args(ScalarFunctionArgs {
281-
args: vec![data.clone(), pattern.clone()],
282-
arg_fields: vec![
283-
Field::new("a", data.data_type(), true).into(),
284-
Field::new("b", pattern.data_type(), true).into(),
285-
],
286-
number_rows: 1,
287-
return_field: Field::new("f", DataType::Utf8, true).into(),
288-
config_options: Arc::clone(&config_options),
289-
})
290-
.expect("to_char should work on valid values"),
291-
)
292-
})
293-
});
294281
}
295282

296283
criterion_group!(benches, criterion_benchmark);

datafusion/functions/src/datetime/to_char.rs

Lines changed: 89 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@
1818
use std::any::Any;
1919
use std::sync::Arc;
2020

21+
use arrow::array::builder::StringBuilder;
2122
use arrow::array::cast::AsArray;
22-
use arrow::array::{Array, ArrayRef, StringArray, new_null_array};
23+
use arrow::array::{Array, ArrayRef};
2324
use arrow::compute::cast;
2425
use arrow::datatypes::DataType;
2526
use arrow::datatypes::DataType::{
2627
Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8,
2728
};
2829
use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
29-
use arrow::error::ArrowError;
3030
use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions};
3131
use datafusion_common::{Result, ScalarValue, exec_err, utils::take_function_args};
3232
use datafusion_expr::TypeSignature::Exact;
@@ -143,20 +143,17 @@ impl ScalarUDFImpl for ToCharFunc {
143143
let [date_time, format] = take_function_args(self.name(), &args)?;
144144

145145
match format {
146-
ColumnarValue::Scalar(ScalarValue::Utf8(None))
147-
| ColumnarValue::Scalar(ScalarValue::Null) => to_char_scalar(date_time, None),
148-
// constant format
149-
ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => {
150-
// invoke to_char_scalar with the known string, without converting to array
151-
to_char_scalar(date_time, Some(format))
146+
ColumnarValue::Scalar(ScalarValue::Null | ScalarValue::Utf8(None)) => {
147+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
152148
}
153-
ColumnarValue::Array(_) => to_char_array(&args),
154-
_ => {
155-
exec_err!(
156-
"Format for `to_char` must be non-null Utf8, received {}",
157-
format.data_type()
158-
)
149+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(fmt))) => {
150+
to_char_scalar(date_time, fmt)
159151
}
152+
ColumnarValue::Array(_) => to_char_array(&args),
153+
_ => exec_err!(
154+
"Format for `to_char` must be non-null Utf8, received {}",
155+
format.data_type()
156+
),
160157
}
161158
}
162159

@@ -171,11 +168,8 @@ impl ScalarUDFImpl for ToCharFunc {
171168

172169
fn build_format_options<'a>(
173170
data_type: &DataType,
174-
format: Option<&'a str>,
175-
) -> Result<FormatOptions<'a>, Result<ColumnarValue>> {
176-
let Some(format) = format else {
177-
return Ok(FormatOptions::new());
178-
};
171+
format: &'a str,
172+
) -> Result<FormatOptions<'a>> {
179173
let format_options = match data_type {
180174
Date32 => FormatOptions::new()
181175
.with_date_format(Some(format))
@@ -194,144 +188,114 @@ fn build_format_options<'a>(
194188
},
195189
),
196190
other => {
197-
return Err(exec_err!(
191+
return exec_err!(
198192
"to_char only supports date, time, timestamp and duration data types, received {other:?}"
199-
));
193+
);
200194
}
201195
};
202196
Ok(format_options)
203197
}
204198

205-
/// Special version when arg\[1] is a scalar
206-
fn to_char_scalar(
207-
expression: &ColumnarValue,
208-
format: Option<&str>,
209-
) -> Result<ColumnarValue> {
210-
// it's possible that the expression is a scalar however because
211-
// of the implementation in arrow-rs we need to convert it to an array
199+
/// Formats `expression` using a constant `format` string.
200+
fn to_char_scalar(expression: &ColumnarValue, format: &str) -> Result<ColumnarValue> {
201+
// ArrayFormatter requires an array, so scalar expressions must be
202+
// converted to a 1-element array first.
212203
let data_type = &expression.data_type();
213204
let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_));
214-
let array = expression.clone().into_array(1)?;
205+
let array = expression.to_array(1)?;
215206

216-
if format.is_none() {
217-
return if is_scalar_expression {
218-
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
219-
} else {
220-
Ok(ColumnarValue::Array(new_null_array(&Utf8, array.len())))
221-
};
222-
}
207+
let format_options = build_format_options(data_type, format)?;
208+
let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
223209

224-
let format_options = match build_format_options(data_type, format) {
225-
Ok(value) => value,
226-
Err(value) => return value,
227-
};
210+
// Pad the preallocated capacity a bit because format specifiers often
211+
// expand the string (e.g., %Y -> "2026")
212+
let fmt_len = format.len() + 10;
213+
let mut builder = StringBuilder::with_capacity(array.len(), array.len() * fmt_len);
228214

229-
let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
230-
let formatted: Result<Vec<Option<String>>, ArrowError> = (0..array.len())
231-
.map(|i| {
232-
if array.is_null(i) {
233-
Ok(None)
234-
} else {
235-
formatter.value(i).try_to_string().map(Some)
236-
}
237-
})
238-
.collect();
239-
240-
if let Ok(formatted) = formatted {
241-
if is_scalar_expression {
242-
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
243-
formatted.first().unwrap().clone(),
244-
)))
215+
for i in 0..array.len() {
216+
if array.is_null(i) {
217+
builder.append_null();
245218
} else {
246-
Ok(ColumnarValue::Array(
247-
Arc::new(StringArray::from(formatted)) as ArrayRef
248-
))
249-
}
250-
} else {
251-
// if the data type was a Date32, formatting could have failed because the format string
252-
// contained datetime specifiers, so we'll retry by casting the date array as a timestamp array
253-
if data_type == &Date32 {
254-
return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
219+
// Write directly into the builder's internal buffer, then
220+
// commit the value with append_value("").
221+
match formatter.value(i).write(&mut builder) {
222+
Ok(()) => builder.append_value(""),
223+
// Arrow's Date32 formatter only handles date specifiers
224+
// (%Y, %m, %d, ...). Format strings with time specifiers
225+
// (%H, %M, %S, ...) cause it to fail. When this happens,
226+
// we retry by casting to Date64, whose datetime formatter
227+
// handles both date and time specifiers (with zero for
228+
// the time components).
229+
Err(_) if data_type == &Date32 => {
230+
return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
231+
}
232+
Err(e) => return Err(e.into()),
233+
}
255234
}
235+
}
256236

257-
exec_err!("{}", formatted.unwrap_err())
237+
let result = builder.finish();
238+
if is_scalar_expression {
239+
let val = result.is_valid(0).then(|| result.value(0).to_string());
240+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
241+
} else {
242+
Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef))
258243
}
259244
}
260245

261246
fn to_char_array(args: &[ColumnarValue]) -> Result<ColumnarValue> {
262247
let arrays = ColumnarValue::values_to_arrays(args)?;
263-
let mut results: Vec<Option<String>> = vec![];
248+
let data_array = &arrays[0];
264249
let format_array = arrays[1].as_string::<i32>();
265-
let data_type = arrays[0].data_type();
250+
let data_type = data_array.data_type();
266251

267-
for idx in 0..arrays[0].len() {
268-
let format = if format_array.is_null(idx) {
269-
None
270-
} else {
271-
Some(format_array.value(idx))
272-
};
273-
if format.is_none() {
274-
results.push(None);
252+
// Arbitrary guess for the length of a typical formatted datetime string
253+
let fmt_len = 30;
254+
let mut builder =
255+
StringBuilder::with_capacity(data_array.len(), data_array.len() * fmt_len);
256+
let mut buffer = String::with_capacity(fmt_len);
257+
258+
for idx in 0..data_array.len() {
259+
if format_array.is_null(idx) || data_array.is_null(idx) {
260+
builder.append_null();
275261
continue;
276262
}
277-
let format_options = match build_format_options(data_type, format) {
278-
Ok(value) => value,
279-
Err(value) => return value,
280-
};
281-
// this isn't ideal but this can't use ValueFormatter as it isn't independent
282-
// from ArrayFormatter
283-
let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?;
284-
let result = formatter.value(idx).try_to_string();
285-
match result {
286-
Ok(value) => results.push(Some(value)),
287-
Err(e) => {
288-
// if the data type was a Date32, formatting could have failed because the format string
289-
// contained datetime specifiers, so we'll treat this specific date element as a timestamp
290-
if data_type == &Date32 {
291-
let failed_date_value = arrays[0].slice(idx, 1);
292-
293-
match retry_date_as_timestamp(&failed_date_value, &format_options) {
294-
Ok(value) => {
295-
results.push(Some(value));
296-
continue;
297-
}
298-
Err(e) => {
299-
return exec_err!("{}", e);
300-
}
301-
}
302-
}
303263

304-
return exec_err!("{}", e);
264+
let format = format_array.value(idx);
265+
let format_options = build_format_options(data_type, format)?;
266+
let formatter = ArrayFormatter::try_new(data_array.as_ref(), &format_options)?;
267+
268+
buffer.clear();
269+
270+
// We'd prefer to write directly to the StringBuilder's internal buffer,
271+
// but the write might fail, and there's no easy way to ensure a partial
272+
// write is removed from the buffer. So instead we write to a temporary
273+
// buffer and `append_value` on success.
274+
match formatter.value(idx).write(&mut buffer) {
275+
Ok(()) => builder.append_value(&buffer),
276+
// Retry with Date64 (see comment in to_char_scalar).
277+
Err(_) if data_type == &Date32 => {
278+
buffer.clear();
279+
let date64_value = cast(&data_array.slice(idx, 1), &Date64)?;
280+
let retry_fmt =
281+
ArrayFormatter::try_new(date64_value.as_ref(), &format_options)?;
282+
retry_fmt.value(0).write(&mut buffer)?;
283+
builder.append_value(&buffer);
305284
}
285+
Err(e) => return Err(e.into()),
306286
}
307287
}
308288

289+
let result = builder.finish();
309290
match args[0] {
310-
ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(StringArray::from(
311-
results,
312-
)) as ArrayRef)),
313-
ColumnarValue::Scalar(_) => match results.first().unwrap() {
314-
Some(value) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
315-
value.to_string(),
316-
)))),
317-
None => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
318-
},
291+
ColumnarValue::Scalar(_) => {
292+
let val = result.is_valid(0).then(|| result.value(0).to_string());
293+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
294+
}
295+
ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)),
319296
}
320297
}
321298

322-
fn retry_date_as_timestamp(
323-
array_ref: &ArrayRef,
324-
format_options: &FormatOptions,
325-
) -> Result<String> {
326-
let target_data_type = Date64;
327-
328-
let date_value = cast(&array_ref, &target_data_type)?;
329-
let formatter = ArrayFormatter::try_new(date_value.as_ref(), format_options)?;
330-
let result = formatter.value(0).try_to_string()?;
331-
332-
Ok(result)
333-
}
334-
335299
#[cfg(test)]
336300
mod tests {
337301
use crate::datetime::to_char::ToCharFunc;

0 commit comments

Comments
 (0)