Skip to content

Commit f56f972

Browse files
committed
perf: Optimize to_char to reduce allocations
1 parent 6713439 commit f56f972

File tree

2 files changed

+96
-113
lines changed

2 files changed

+96
-113
lines changed

datafusion/functions/src/datetime/to_char.rs

Lines changed: 79 additions & 111 deletions
Original file line number | Diff line number | Diff line change
@@ -18,15 +18,15 @@
1818
use std::any::Any;
1919
use std::sync::Arc;
2020

21+
use arrow::array::builder::StringBuilder;
2122
use arrow::array::cast::AsArray;
22-
use arrow::array::{Array, ArrayRef, StringArray, new_null_array};
23+
use arrow::array::{Array, ArrayRef, new_null_array};
2324
use arrow::compute::cast;
2425
use arrow::datatypes::DataType;
2526
use arrow::datatypes::DataType::{
2627
Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8,
2728
};
2829
use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
29-
use arrow::error::ArrowError;
3030
use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions};
3131
use datafusion_common::{Result, ScalarValue, exec_err, utils::take_function_args};
3232
use datafusion_expr::TypeSignature::Exact;
@@ -143,20 +143,15 @@ impl ScalarUDFImpl for ToCharFunc {
143143
let [date_time, format] = take_function_args(self.name(), &args)?;
144144

145145
match format {
146-
ColumnarValue::Scalar(ScalarValue::Utf8(None))
147-
| ColumnarValue::Scalar(ScalarValue::Null) => to_char_scalar(date_time, None),
148-
// constant format
149-
ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => {
150-
// invoke to_char_scalar with the known string, without converting to array
151-
to_char_scalar(date_time, Some(format))
146+
ColumnarValue::Scalar(ScalarValue::Null) => to_char_scalar(date_time, None),
147+
ColumnarValue::Scalar(ScalarValue::Utf8(fmt)) => {
148+
to_char_scalar(date_time, fmt.as_deref())
152149
}
153150
ColumnarValue::Array(_) => to_char_array(&args),
154-
_ => {
155-
exec_err!(
156-
"Format for `to_char` must be non-null Utf8, received {}",
157-
format.data_type()
158-
)
159-
}
151+
_ => exec_err!(
152+
"Format for `to_char` must be non-null Utf8, received {}",
153+
format.data_type()
154+
),
160155
}
161156
}
162157

@@ -172,7 +167,7 @@ impl ScalarUDFImpl for ToCharFunc {
172167
fn build_format_options<'a>(
173168
data_type: &DataType,
174169
format: Option<&'a str>,
175-
) -> Result<FormatOptions<'a>, Result<ColumnarValue>> {
170+
) -> Result<FormatOptions<'a>> {
176171
let Some(format) = format else {
177172
return Ok(FormatOptions::new());
178173
};
@@ -194,24 +189,24 @@ fn build_format_options<'a>(
194189
},
195190
),
196191
other => {
197-
return Err(exec_err!(
192+
return exec_err!(
198193
"to_char only supports date, time, timestamp and duration data types, received {other:?}"
199-
));
194+
);
200195
}
201196
};
202197
Ok(format_options)
203198
}
204199

205-
/// Special version when arg\[1] is a scalar
200+
/// Formats `expression` using a constant `format` string.
206201
fn to_char_scalar(
207202
expression: &ColumnarValue,
208203
format: Option<&str>,
209204
) -> Result<ColumnarValue> {
210-
// it's possible that the expression is a scalar however because
211-
// of the implementation in arrow-rs we need to convert it to an array
205+
// ArrayFormatter requires an array, so scalar expressions must be
206+
// converted to a 1-element array first.
212207
let data_type = &expression.data_type();
213208
let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_));
214-
let array = expression.clone().into_array(1)?;
209+
let array = expression.to_array(1)?;
215210

216211
if format.is_none() {
217212
return if is_scalar_expression {
@@ -221,117 +216,90 @@ fn to_char_scalar(
221216
};
222217
}
223218

224-
let format_options = match build_format_options(data_type, format) {
225-
Ok(value) => value,
226-
Err(value) => return value,
227-
};
228-
219+
let format_options = build_format_options(data_type, format)?;
229220
let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
230-
let formatted: Result<Vec<Option<String>>, ArrowError> = (0..array.len())
231-
.map(|i| {
232-
if array.is_null(i) {
233-
Ok(None)
234-
} else {
235-
formatter.value(i).try_to_string().map(Some)
236-
}
237-
})
238-
.collect();
239-
240-
if let Ok(formatted) = formatted {
241-
if is_scalar_expression {
242-
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
243-
formatted.first().unwrap().clone(),
244-
)))
221+
222+
let fmt_len = format.map_or(20, |f| f.len() + 10);
223+
let mut builder = StringBuilder::with_capacity(array.len(), array.len() * fmt_len);
224+
let mut buffer = String::with_capacity(fmt_len);
225+
226+
for i in 0..array.len() {
227+
if array.is_null(i) {
228+
builder.append_null();
245229
} else {
246-
Ok(ColumnarValue::Array(
247-
Arc::new(StringArray::from(formatted)) as ArrayRef
248-
))
249-
}
250-
} else {
251-
// if the data type was a Date32, formatting could have failed because the format string
252-
// contained datetime specifiers, so we'll retry by casting the date array as a timestamp array
253-
if data_type == &Date32 {
254-
return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
230+
buffer.clear();
231+
match formatter.value(i).write(&mut buffer) {
232+
Ok(()) => builder.append_value(&buffer),
233+
// Arrow's Date32 formatter only handles date specifiers
234+
// (%Y, %m, %d, ...). Format strings with time specifiers
235+
// (%H, %M, %S, ...) cause it to fail. When this happens,
236+
// we retry by casting to Date64, whose datetime formatter
237+
// handles both date and time specifiers (with zero for
238+
// the time components).
239+
Err(_) if data_type == &Date32 => {
240+
return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
241+
}
242+
Err(e) => return Err(e.into()),
243+
}
255244
}
245+
}
256246

257-
exec_err!("{}", formatted.unwrap_err())
247+
let result = builder.finish();
248+
if is_scalar_expression {
249+
let val = result.is_valid(0).then(|| result.value(0).to_string());
250+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
251+
} else {
252+
Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef))
258253
}
259254
}
260255

261256
fn to_char_array(args: &[ColumnarValue]) -> Result<ColumnarValue> {
262257
let arrays = ColumnarValue::values_to_arrays(args)?;
263-
let mut results: Vec<Option<String>> = vec![];
258+
let data_array = &arrays[0];
264259
let format_array = arrays[1].as_string::<i32>();
265-
let data_type = arrays[0].data_type();
260+
let data_type = data_array.data_type();
266261

267-
for idx in 0..arrays[0].len() {
268-
let format = if format_array.is_null(idx) {
269-
None
270-
} else {
271-
Some(format_array.value(idx))
272-
};
273-
if format.is_none() {
274-
results.push(None);
262+
let fmt_len = 30;
263+
let mut builder =
264+
StringBuilder::with_capacity(data_array.len(), data_array.len() * fmt_len);
265+
let mut buffer = String::with_capacity(fmt_len);
266+
267+
for idx in 0..data_array.len() {
268+
if format_array.is_null(idx) || data_array.is_null(idx) {
269+
builder.append_null();
275270
continue;
276271
}
277-
let format_options = match build_format_options(data_type, format) {
278-
Ok(value) => value,
279-
Err(value) => return value,
280-
};
281-
// this isn't ideal but this can't use ValueFormatter as it isn't independent
282-
// from ArrayFormatter
283-
let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?;
284-
let result = formatter.value(idx).try_to_string();
285-
match result {
286-
Ok(value) => results.push(Some(value)),
287-
Err(e) => {
288-
// if the data type was a Date32, formatting could have failed because the format string
289-
// contained datetime specifiers, so we'll treat this specific date element as a timestamp
290-
if data_type == &Date32 {
291-
let failed_date_value = arrays[0].slice(idx, 1);
292-
293-
match retry_date_as_timestamp(&failed_date_value, &format_options) {
294-
Ok(value) => {
295-
results.push(Some(value));
296-
continue;
297-
}
298-
Err(e) => {
299-
return exec_err!("{}", e);
300-
}
301-
}
302-
}
303-
304-
return exec_err!("{}", e);
272+
let format = Some(format_array.value(idx));
273+
274+
buffer.clear();
275+
let format_options = build_format_options(data_type, format)?;
276+
let formatter = ArrayFormatter::try_new(data_array.as_ref(), &format_options)?;
277+
278+
match formatter.value(idx).write(&mut buffer) {
279+
Ok(()) => builder.append_value(&buffer),
280+
Err(_) if data_type == &Date32 => {
281+
// Retry with Date64 (see comment in to_char_scalar).
282+
buffer.clear();
283+
let date64_value = cast(&data_array.slice(idx, 1), &Date64)?;
284+
let retry_fmt =
285+
ArrayFormatter::try_new(date64_value.as_ref(), &format_options)?;
286+
retry_fmt.value(0).write(&mut buffer)?;
287+
builder.append_value(&buffer);
305288
}
289+
Err(e) => return Err(e.into()),
306290
}
307291
}
308292

293+
let result = builder.finish();
309294
match args[0] {
310-
ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(StringArray::from(
311-
results,
312-
)) as ArrayRef)),
313-
ColumnarValue::Scalar(_) => match results.first().unwrap() {
314-
Some(value) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
315-
value.to_string(),
316-
)))),
317-
None => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
318-
},
295+
ColumnarValue::Scalar(_) => {
296+
let val = result.is_valid(0).then(|| result.value(0).to_string());
297+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
298+
}
299+
ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)),
319300
}
320301
}
321302

322-
fn retry_date_as_timestamp(
323-
array_ref: &ArrayRef,
324-
format_options: &FormatOptions,
325-
) -> Result<String> {
326-
let target_data_type = Date64;
327-
328-
let date_value = cast(&array_ref, &target_data_type)?;
329-
let formatter = ArrayFormatter::try_new(date_value.as_ref(), format_options)?;
330-
let result = formatter.value(0).try_to_string()?;
331-
332-
Ok(result)
333-
}
334-
335303
#[cfg(test)]
336304
mod tests {
337305
use crate::datetime::to_char::ToCharFunc;

datafusion/sqllogictest/test_files/datetime/timestamps.slt

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3676,10 +3676,10 @@ select to_char(arrow_cast(123456, 'Duration(Second)'), null);
36763676
----
36773677
NULL
36783678

3679-
query error DataFusion error: Execution error: Cast error: Format error
3679+
query error DataFusion error: Arrow error: Cast error: Format error
36803680
SELECT to_char(timestamps, '%X%K') from formats;
36813681

3682-
query error DataFusion error: Execution error: Cast error: Format error
3682+
query error DataFusion error: Arrow error: Cast error: Format error
36833683
SELECT to_char('2000-02-03'::date, '%X%K');
36843684

36853685
query T
@@ -3726,6 +3726,21 @@ select to_char('2020-01-01 00:10:20.123'::timestamp at time zone 'America/New_Yo
37263726
----
37273727
2020-01-01 00:10:20.123
37283728

3729+
# Null values with array format
3730+
query T
3731+
SELECT to_char(column1, column2)
3732+
FROM (VALUES
3733+
(DATE '2020-09-01', '%Y-%m-%d'),
3734+
(NULL, '%Y-%m-%d'),
3735+
(DATE '2020-09-02', NULL),
3736+
(NULL, NULL)
3737+
);
3738+
----
3739+
2020-09-01
3740+
NULL
3741+
NULL
3742+
NULL
3743+
37293744
statement ok
37303745
drop table formats;
37313746

0 commit comments

Comments
 (0)