Skip to content

Commit 0a1245c

Browse files
committed
perf: Optimize to_char to reduce allocations
1 parent 6713439 commit 0a1245c

File tree

3 files changed

+124
-148
lines changed

3 files changed

+124
-148
lines changed

datafusion/functions/benches/to_char.rs

Lines changed: 24 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,12 @@
1818
use std::hint::black_box;
1919
use std::sync::Arc;
2020

21-
use arrow::array::{ArrayRef, Date32Array, StringArray};
21+
use arrow::array::{ArrayRef, Date32Array, Date64Array, StringArray};
2222
use arrow::datatypes::{DataType, Field};
2323
use chrono::TimeDelta;
2424
use chrono::prelude::*;
2525
use criterion::{Criterion, criterion_group, criterion_main};
2626
use datafusion_common::ScalarValue;
27-
use datafusion_common::ScalarValue::TimestampNanosecond;
2827
use datafusion_common::config::ConfigOptions;
2928
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
3029
use datafusion_functions::datetime::to_char;
@@ -63,6 +62,26 @@ fn generate_date32_array(rng: &mut ThreadRng) -> Date32Array {
6362
Date32Array::from(data)
6463
}
6564

65+
fn generate_date64_array(rng: &mut ThreadRng) -> Date64Array {
66+
let start_date = "1970-01-01"
67+
.parse::<NaiveDate>()
68+
.expect("Date should parse");
69+
let end_date = "2050-12-31"
70+
.parse::<NaiveDate>()
71+
.expect("Date should parse");
72+
let mut data: Vec<i64> = Vec::with_capacity(1000);
73+
for _ in 0..1000 {
74+
let date = pick_date_in_range(rng, start_date, end_date);
75+
let millis = date
76+
.and_hms_opt(0, 0, 0)
77+
.unwrap()
78+
.and_utc()
79+
.timestamp_millis();
80+
data.push(millis);
81+
}
82+
Date64Array::from(data)
83+
}
84+
6685
const DATE_PATTERNS: [&str; 5] =
6786
["%Y:%m:%d", "%d-%m-%Y", "%d%m%Y", "%Y%m%d", "%Y...%m...%d"];
6887

@@ -155,7 +174,7 @@ fn criterion_benchmark(c: &mut Criterion) {
155174

156175
c.bench_function("to_char_array_datetime_patterns_1000", |b| {
157176
let mut rng = rand::rng();
158-
let data_arr = generate_date32_array(&mut rng);
177+
let data_arr = generate_date64_array(&mut rng);
159178
let batch_len = data_arr.len();
160179
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
161180
let patterns = ColumnarValue::Array(Arc::new(generate_datetime_pattern_array(
@@ -182,7 +201,7 @@ fn criterion_benchmark(c: &mut Criterion) {
182201

183202
c.bench_function("to_char_array_mixed_patterns_1000", |b| {
184203
let mut rng = rand::rng();
185-
let data_arr = generate_date32_array(&mut rng);
204+
let data_arr = generate_date64_array(&mut rng);
186205
let batch_len = data_arr.len();
187206
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
188207
let patterns = ColumnarValue::Array(Arc::new(generate_mixed_pattern_array(
@@ -235,7 +254,7 @@ fn criterion_benchmark(c: &mut Criterion) {
235254

236255
c.bench_function("to_char_scalar_datetime_pattern_1000", |b| {
237256
let mut rng = rand::rng();
238-
let data_arr = generate_date32_array(&mut rng);
257+
let data_arr = generate_date64_array(&mut rng);
239258
let batch_len = data_arr.len();
240259
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
241260
let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some(
@@ -260,37 +279,6 @@ fn criterion_benchmark(c: &mut Criterion) {
260279
})
261280
});
262281

263-
c.bench_function("to_char_scalar_1000", |b| {
264-
let mut rng = rand::rng();
265-
let timestamp = "2026-07-08T09:10:11"
266-
.parse::<NaiveDateTime>()
267-
.unwrap()
268-
.with_nanosecond(56789)
269-
.unwrap()
270-
.and_utc()
271-
.timestamp_nanos_opt()
272-
.unwrap();
273-
let data = ColumnarValue::Scalar(TimestampNanosecond(Some(timestamp), None));
274-
let pattern =
275-
ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng))));
276-
277-
b.iter(|| {
278-
black_box(
279-
to_char()
280-
.invoke_with_args(ScalarFunctionArgs {
281-
args: vec![data.clone(), pattern.clone()],
282-
arg_fields: vec![
283-
Field::new("a", data.data_type(), true).into(),
284-
Field::new("b", pattern.data_type(), true).into(),
285-
],
286-
number_rows: 1,
287-
return_field: Field::new("f", DataType::Utf8, true).into(),
288-
config_options: Arc::clone(&config_options),
289-
})
290-
.expect("to_char should work on valid values"),
291-
)
292-
})
293-
});
294282
}
295283

296284
criterion_group!(benches, criterion_benchmark);

datafusion/functions/src/datetime/to_char.rs

Lines changed: 83 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@
1818
use std::any::Any;
1919
use std::sync::Arc;
2020

21+
use arrow::array::builder::StringBuilder;
2122
use arrow::array::cast::AsArray;
22-
use arrow::array::{Array, ArrayRef, StringArray, new_null_array};
23+
use arrow::array::{Array, ArrayRef, new_null_array};
2324
use arrow::compute::cast;
2425
use arrow::datatypes::DataType;
2526
use arrow::datatypes::DataType::{
2627
Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8,
2728
};
2829
use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
29-
use arrow::error::ArrowError;
3030
use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions};
3131
use datafusion_common::{Result, ScalarValue, exec_err, utils::take_function_args};
3232
use datafusion_expr::TypeSignature::Exact;
@@ -143,20 +143,15 @@ impl ScalarUDFImpl for ToCharFunc {
143143
let [date_time, format] = take_function_args(self.name(), &args)?;
144144

145145
match format {
146-
ColumnarValue::Scalar(ScalarValue::Utf8(None))
147-
| ColumnarValue::Scalar(ScalarValue::Null) => to_char_scalar(date_time, None),
148-
// constant format
149-
ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => {
150-
// invoke to_char_scalar with the known string, without converting to array
151-
to_char_scalar(date_time, Some(format))
146+
ColumnarValue::Scalar(ScalarValue::Null) => to_char_scalar(date_time, None),
147+
ColumnarValue::Scalar(ScalarValue::Utf8(fmt)) => {
148+
to_char_scalar(date_time, fmt.as_deref())
152149
}
153150
ColumnarValue::Array(_) => to_char_array(&args),
154-
_ => {
155-
exec_err!(
156-
"Format for `to_char` must be non-null Utf8, received {}",
157-
format.data_type()
158-
)
159-
}
151+
_ => exec_err!(
152+
"Format for `to_char` must be non-null Utf8, received {}",
153+
format.data_type()
154+
),
160155
}
161156
}
162157

@@ -172,7 +167,7 @@ impl ScalarUDFImpl for ToCharFunc {
172167
fn build_format_options<'a>(
173168
data_type: &DataType,
174169
format: Option<&'a str>,
175-
) -> Result<FormatOptions<'a>, Result<ColumnarValue>> {
170+
) -> Result<FormatOptions<'a>> {
176171
let Some(format) = format else {
177172
return Ok(FormatOptions::new());
178173
};
@@ -194,24 +189,24 @@ fn build_format_options<'a>(
194189
},
195190
),
196191
other => {
197-
return Err(exec_err!(
192+
return exec_err!(
198193
"to_char only supports date, time, timestamp and duration data types, received {other:?}"
199-
));
194+
);
200195
}
201196
};
202197
Ok(format_options)
203198
}
204199

205-
/// Special version when arg\[1] is a scalar
200+
/// Formats `expression` using a constant `format` string.
206201
fn to_char_scalar(
207202
expression: &ColumnarValue,
208203
format: Option<&str>,
209204
) -> Result<ColumnarValue> {
210-
// it's possible that the expression is a scalar however because
211-
// of the implementation in arrow-rs we need to convert it to an array
205+
// ArrayFormatter requires an array, so scalar expressions must be
206+
// converted to a 1-element array first.
212207
let data_type = &expression.data_type();
213208
let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_));
214-
let array = expression.clone().into_array(1)?;
209+
let array = expression.to_array(1)?;
215210

216211
if format.is_none() {
217212
return if is_scalar_expression {
@@ -221,117 +216,95 @@ fn to_char_scalar(
221216
};
222217
}
223218

224-
let format_options = match build_format_options(data_type, format) {
225-
Ok(value) => value,
226-
Err(value) => return value,
227-
};
228-
219+
let format_options = build_format_options(data_type, format)?;
229220
let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
230-
let formatted: Result<Vec<Option<String>>, ArrowError> = (0..array.len())
231-
.map(|i| {
232-
if array.is_null(i) {
233-
Ok(None)
234-
} else {
235-
formatter.value(i).try_to_string().map(Some)
236-
}
237-
})
238-
.collect();
239-
240-
if let Ok(formatted) = formatted {
241-
if is_scalar_expression {
242-
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
243-
formatted.first().unwrap().clone(),
244-
)))
221+
222+
let fmt_len = format.map_or(20, |f| f.len() + 10);
223+
let mut builder = StringBuilder::with_capacity(array.len(), array.len() * fmt_len);
224+
225+
for i in 0..array.len() {
226+
if array.is_null(i) {
227+
builder.append_null();
245228
} else {
246-
Ok(ColumnarValue::Array(
247-
Arc::new(StringArray::from(formatted)) as ArrayRef
248-
))
249-
}
250-
} else {
251-
// if the data type was a Date32, formatting could have failed because the format string
252-
// contained datetime specifiers, so we'll retry by casting the date array as a timestamp array
253-
if data_type == &Date32 {
254-
return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
229+
// Write directly into the builder's internal buffer, then
230+
// commit the value with append_value("").
231+
match formatter.value(i).write(&mut builder) {
232+
Ok(()) => builder.append_value(""),
233+
// Arrow's Date32 formatter only handles date specifiers
234+
// (%Y, %m, %d, ...). Format strings with time specifiers
235+
// (%H, %M, %S, ...) cause it to fail. When this happens,
236+
// we retry by casting to Date64, whose datetime formatter
237+
// handles both date and time specifiers (with zero for
238+
// the time components).
239+
Err(_) if data_type == &Date32 => {
240+
return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
241+
}
242+
Err(e) => return Err(e.into()),
243+
}
255244
}
245+
}
256246

257-
exec_err!("{}", formatted.unwrap_err())
247+
let result = builder.finish();
248+
if is_scalar_expression {
249+
let val = result.is_valid(0).then(|| result.value(0).to_string());
250+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
251+
} else {
252+
Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef))
258253
}
259254
}
260255

261256
fn to_char_array(args: &[ColumnarValue]) -> Result<ColumnarValue> {
262257
let arrays = ColumnarValue::values_to_arrays(args)?;
263-
let mut results: Vec<Option<String>> = vec![];
258+
let data_array = &arrays[0];
264259
let format_array = arrays[1].as_string::<i32>();
265-
let data_type = arrays[0].data_type();
260+
let data_type = data_array.data_type();
266261

267-
for idx in 0..arrays[0].len() {
268-
let format = if format_array.is_null(idx) {
269-
None
270-
} else {
271-
Some(format_array.value(idx))
272-
};
273-
if format.is_none() {
274-
results.push(None);
262+
let fmt_len = 30;
263+
let mut builder =
264+
StringBuilder::with_capacity(data_array.len(), data_array.len() * fmt_len);
265+
let mut buffer = String::with_capacity(fmt_len);
266+
267+
for idx in 0..data_array.len() {
268+
if format_array.is_null(idx) || data_array.is_null(idx) {
269+
builder.append_null();
275270
continue;
276271
}
277-
let format_options = match build_format_options(data_type, format) {
278-
Ok(value) => value,
279-
Err(value) => return value,
280-
};
281-
// this isn't ideal but this can't use ValueFormatter as it isn't independent
282-
// from ArrayFormatter
283-
let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?;
284-
let result = formatter.value(idx).try_to_string();
285-
match result {
286-
Ok(value) => results.push(Some(value)),
287-
Err(e) => {
288-
// if the data type was a Date32, formatting could have failed because the format string
289-
// contained datetime specifiers, so we'll treat this specific date element as a timestamp
290-
if data_type == &Date32 {
291-
let failed_date_value = arrays[0].slice(idx, 1);
292-
293-
match retry_date_as_timestamp(&failed_date_value, &format_options) {
294-
Ok(value) => {
295-
results.push(Some(value));
296-
continue;
297-
}
298-
Err(e) => {
299-
return exec_err!("{}", e);
300-
}
301-
}
302-
}
303272

304-
return exec_err!("{}", e);
273+
let format = Some(format_array.value(idx));
274+
let format_options = build_format_options(data_type, format)?;
275+
let formatter = ArrayFormatter::try_new(data_array.as_ref(), &format_options)?;
276+
277+
buffer.clear();
278+
279+
// We'd prefer to write directly to the StringBuilder's internal buffer,
280+
// but the write might fail, and there's no easy way to ensure a partial
281+
// write is removed from the buffer. So instead we write to a temporary
282+
// buffer and `append_value` on success.
283+
match formatter.value(idx).write(&mut buffer) {
284+
Ok(()) => builder.append_value(&buffer),
285+
// Retry with Date64 (see comment in to_char_scalar).
286+
Err(_) if data_type == &Date32 => {
287+
buffer.clear();
288+
let date64_value = cast(&data_array.slice(idx, 1), &Date64)?;
289+
let retry_fmt =
290+
ArrayFormatter::try_new(date64_value.as_ref(), &format_options)?;
291+
retry_fmt.value(0).write(&mut buffer)?;
292+
builder.append_value(&buffer);
305293
}
294+
Err(e) => return Err(e.into()),
306295
}
307296
}
308297

298+
let result = builder.finish();
309299
match args[0] {
310-
ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(StringArray::from(
311-
results,
312-
)) as ArrayRef)),
313-
ColumnarValue::Scalar(_) => match results.first().unwrap() {
314-
Some(value) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
315-
value.to_string(),
316-
)))),
317-
None => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
318-
},
300+
ColumnarValue::Scalar(_) => {
301+
let val = result.is_valid(0).then(|| result.value(0).to_string());
302+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
303+
}
304+
ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)),
319305
}
320306
}
321307

322-
fn retry_date_as_timestamp(
323-
array_ref: &ArrayRef,
324-
format_options: &FormatOptions,
325-
) -> Result<String> {
326-
let target_data_type = Date64;
327-
328-
let date_value = cast(&array_ref, &target_data_type)?;
329-
let formatter = ArrayFormatter::try_new(date_value.as_ref(), format_options)?;
330-
let result = formatter.value(0).try_to_string()?;
331-
332-
Ok(result)
333-
}
334-
335308
#[cfg(test)]
336309
mod tests {
337310
use crate::datetime::to_char::ToCharFunc;

0 commit comments

Comments
 (0)