1818use std:: any:: Any ;
1919use std:: sync:: Arc ;
2020
21+ use arrow:: array:: builder:: StringBuilder ;
2122use arrow:: array:: cast:: AsArray ;
22- use arrow:: array:: { Array , ArrayRef , StringArray , new_null_array } ;
23+ use arrow:: array:: { Array , ArrayRef } ;
2324use arrow:: compute:: cast;
2425use arrow:: datatypes:: DataType ;
2526use arrow:: datatypes:: DataType :: {
2627 Date32 , Date64 , Duration , Time32 , Time64 , Timestamp , Utf8 ,
2728} ;
2829use arrow:: datatypes:: TimeUnit :: { Microsecond , Millisecond , Nanosecond , Second } ;
29- use arrow:: error:: ArrowError ;
3030use arrow:: util:: display:: { ArrayFormatter , DurationFormat , FormatOptions } ;
3131use datafusion_common:: { Result , ScalarValue , exec_err, utils:: take_function_args} ;
3232use datafusion_expr:: TypeSignature :: Exact ;
@@ -143,20 +143,17 @@ impl ScalarUDFImpl for ToCharFunc {
143143 let [ date_time, format] = take_function_args ( self . name ( ) , & args) ?;
144144
145145 match format {
146- ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( None ) )
147- | ColumnarValue :: Scalar ( ScalarValue :: Null ) => to_char_scalar ( date_time, None ) ,
148- // constant format
149- ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some ( format) ) ) => {
150- // invoke to_char_scalar with the known string, without converting to array
151- to_char_scalar ( date_time, Some ( format) )
146+ ColumnarValue :: Scalar ( ScalarValue :: Null | ScalarValue :: Utf8 ( None ) ) => {
147+ Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( None ) ) )
152148 }
153- ColumnarValue :: Array ( _) => to_char_array ( & args) ,
154- _ => {
155- exec_err ! (
156- "Format for `to_char` must be non-null Utf8, received {}" ,
157- format. data_type( )
158- )
149+ ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some ( fmt) ) ) => {
150+ to_char_scalar ( date_time, fmt)
159151 }
152+ ColumnarValue :: Array ( _) => to_char_array ( & args) ,
153+ _ => exec_err ! (
154+ "Format for `to_char` must be non-null Utf8, received {}" ,
155+ format. data_type( )
156+ ) ,
160157 }
161158 }
162159
@@ -171,11 +168,8 @@ impl ScalarUDFImpl for ToCharFunc {
171168
172169fn build_format_options < ' a > (
173170 data_type : & DataType ,
174- format : Option < & ' a str > ,
175- ) -> Result < FormatOptions < ' a > , Result < ColumnarValue > > {
176- let Some ( format) = format else {
177- return Ok ( FormatOptions :: new ( ) ) ;
178- } ;
171+ format : & ' a str ,
172+ ) -> Result < FormatOptions < ' a > > {
179173 let format_options = match data_type {
180174 Date32 => FormatOptions :: new ( )
181175 . with_date_format ( Some ( format) )
@@ -194,144 +188,114 @@ fn build_format_options<'a>(
194188 } ,
195189 ) ,
196190 other => {
197- return Err ( exec_err ! (
191+ return exec_err ! (
198192 "to_char only supports date, time, timestamp and duration data types, received {other:?}"
199- ) ) ;
193+ ) ;
200194 }
201195 } ;
202196 Ok ( format_options)
203197}
204198
205- /// Special version when arg\[1] is a scalar
206- fn to_char_scalar (
207- expression : & ColumnarValue ,
208- format : Option < & str > ,
209- ) -> Result < ColumnarValue > {
210- // it's possible that the expression is a scalar however because
211- // of the implementation in arrow-rs we need to convert it to an array
199+ /// Formats `expression` using a constant `format` string.
200+ fn to_char_scalar ( expression : & ColumnarValue , format : & str ) -> Result < ColumnarValue > {
201+ // ArrayFormatter requires an array, so scalar expressions must be
202+ // converted to a 1-element array first.
212203 let data_type = & expression. data_type ( ) ;
213204 let is_scalar_expression = matches ! ( & expression, ColumnarValue :: Scalar ( _) ) ;
214- let array = expression. clone ( ) . into_array ( 1 ) ?;
205+ let array = expression. to_array ( 1 ) ?;
215206
216- if format. is_none ( ) {
217- return if is_scalar_expression {
218- Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( None ) ) )
219- } else {
220- Ok ( ColumnarValue :: Array ( new_null_array ( & Utf8 , array. len ( ) ) ) )
221- } ;
222- }
207+ let format_options = build_format_options ( data_type, format) ?;
208+ let formatter = ArrayFormatter :: try_new ( array. as_ref ( ) , & format_options) ?;
223209
224- let format_options = match build_format_options ( data_type , format) {
225- Ok ( value ) => value ,
226- Err ( value ) => return value ,
227- } ;
210+ // Pad the preallocated capacity a bit because format specifiers often
211+ // expand the string (e.g., %Y -> "2026")
212+ let fmt_len = format . len ( ) + 10 ;
213+ let mut builder = StringBuilder :: with_capacity ( array . len ( ) , array . len ( ) * fmt_len ) ;
228214
229- let formatter = ArrayFormatter :: try_new ( array. as_ref ( ) , & format_options) ?;
230- let formatted: Result < Vec < Option < String > > , ArrowError > = ( 0 ..array. len ( ) )
231- . map ( |i| {
232- if array. is_null ( i) {
233- Ok ( None )
234- } else {
235- formatter. value ( i) . try_to_string ( ) . map ( Some )
236- }
237- } )
238- . collect ( ) ;
239-
240- if let Ok ( formatted) = formatted {
241- if is_scalar_expression {
242- Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 (
243- formatted. first ( ) . unwrap ( ) . clone ( ) ,
244- ) ) )
215+ for i in 0 ..array. len ( ) {
216+ if array. is_null ( i) {
217+ builder. append_null ( ) ;
245218 } else {
246- Ok ( ColumnarValue :: Array (
247- Arc :: new ( StringArray :: from ( formatted) ) as ArrayRef
248- ) )
249- }
250- } else {
251- // if the data type was a Date32, formatting could have failed because the format string
252- // contained datetime specifiers, so we'll retry by casting the date array as a timestamp array
253- if data_type == & Date32 {
254- return to_char_scalar ( & expression. cast_to ( & Date64 , None ) ?, format) ;
219+ // Write directly into the builder's internal buffer, then
220+ // commit the value with append_value("").
221+ match formatter. value ( i) . write ( & mut builder) {
222+ Ok ( ( ) ) => builder. append_value ( "" ) ,
223+ // Arrow's Date32 formatter only handles date specifiers
224+ // (%Y, %m, %d, ...). Format strings with time specifiers
225+ // (%H, %M, %S, ...) cause it to fail. When this happens,
226+ // we retry by casting to Date64, whose datetime formatter
227+ // handles both date and time specifiers (with zero for
228+ // the time components).
229+ Err ( _) if data_type == & Date32 => {
230+ return to_char_scalar ( & expression. cast_to ( & Date64 , None ) ?, format) ;
231+ }
232+ Err ( e) => return Err ( e. into ( ) ) ,
233+ }
255234 }
235+ }
256236
257- exec_err ! ( "{}" , formatted. unwrap_err( ) )
237+ let result = builder. finish ( ) ;
238+ if is_scalar_expression {
239+ let val = result. is_valid ( 0 ) . then ( || result. value ( 0 ) . to_string ( ) ) ;
240+ Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( val) ) )
241+ } else {
242+ Ok ( ColumnarValue :: Array ( Arc :: new ( result) as ArrayRef ) )
258243 }
259244}
260245
261246fn to_char_array ( args : & [ ColumnarValue ] ) -> Result < ColumnarValue > {
262247 let arrays = ColumnarValue :: values_to_arrays ( args) ?;
263- let mut results : Vec < Option < String > > = vec ! [ ] ;
248+ let data_array = & arrays [ 0 ] ;
264249 let format_array = arrays[ 1 ] . as_string :: < i32 > ( ) ;
265- let data_type = arrays [ 0 ] . data_type ( ) ;
250+ let data_type = data_array . data_type ( ) ;
266251
267- for idx in 0 ..arrays[ 0 ] . len ( ) {
268- let format = if format_array. is_null ( idx) {
269- None
270- } else {
271- Some ( format_array. value ( idx) )
272- } ;
273- if format. is_none ( ) {
274- results. push ( None ) ;
252+ // Arbitrary guess for the length of a typical formatted datetime string
253+ let fmt_len = 30 ;
254+ let mut builder =
255+ StringBuilder :: with_capacity ( data_array. len ( ) , data_array. len ( ) * fmt_len) ;
256+ let mut buffer = String :: with_capacity ( fmt_len) ;
257+
258+ for idx in 0 ..data_array. len ( ) {
259+ if format_array. is_null ( idx) || data_array. is_null ( idx) {
260+ builder. append_null ( ) ;
275261 continue ;
276262 }
277- let format_options = match build_format_options ( data_type, format) {
278- Ok ( value) => value,
279- Err ( value) => return value,
280- } ;
281- // this isn't ideal but this can't use ValueFormatter as it isn't independent
282- // from ArrayFormatter
283- let formatter = ArrayFormatter :: try_new ( arrays[ 0 ] . as_ref ( ) , & format_options) ?;
284- let result = formatter. value ( idx) . try_to_string ( ) ;
285- match result {
286- Ok ( value) => results. push ( Some ( value) ) ,
287- Err ( e) => {
288- // if the data type was a Date32, formatting could have failed because the format string
289- // contained datetime specifiers, so we'll treat this specific date element as a timestamp
290- if data_type == & Date32 {
291- let failed_date_value = arrays[ 0 ] . slice ( idx, 1 ) ;
292-
293- match retry_date_as_timestamp ( & failed_date_value, & format_options) {
294- Ok ( value) => {
295- results. push ( Some ( value) ) ;
296- continue ;
297- }
298- Err ( e) => {
299- return exec_err ! ( "{}" , e) ;
300- }
301- }
302- }
303263
304- return exec_err ! ( "{}" , e) ;
264+ let format = format_array. value ( idx) ;
265+ let format_options = build_format_options ( data_type, format) ?;
266+ let formatter = ArrayFormatter :: try_new ( data_array. as_ref ( ) , & format_options) ?;
267+
268+ buffer. clear ( ) ;
269+
270+ // We'd prefer to write directly to the StringBuilder's internal buffer,
271+ // but the write might fail, and there's no easy way to ensure a partial
272+ // write is removed from the buffer. So instead we write to a temporary
273+ // buffer and `append_value` on success.
274+ match formatter. value ( idx) . write ( & mut buffer) {
275+ Ok ( ( ) ) => builder. append_value ( & buffer) ,
276+ // Retry with Date64 (see comment in to_char_scalar).
277+ Err ( _) if data_type == & Date32 => {
278+ buffer. clear ( ) ;
279+ let date64_value = cast ( & data_array. slice ( idx, 1 ) , & Date64 ) ?;
280+ let retry_fmt =
281+ ArrayFormatter :: try_new ( date64_value. as_ref ( ) , & format_options) ?;
282+ retry_fmt. value ( 0 ) . write ( & mut buffer) ?;
283+ builder. append_value ( & buffer) ;
305284 }
285+ Err ( e) => return Err ( e. into ( ) ) ,
306286 }
307287 }
308288
289+ let result = builder. finish ( ) ;
309290 match args[ 0 ] {
310- ColumnarValue :: Array ( _) => Ok ( ColumnarValue :: Array ( Arc :: new ( StringArray :: from (
311- results,
312- ) ) as ArrayRef ) ) ,
313- ColumnarValue :: Scalar ( _) => match results. first ( ) . unwrap ( ) {
314- Some ( value) => Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some (
315- value. to_string ( ) ,
316- ) ) ) ) ,
317- None => Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( None ) ) ) ,
318- } ,
291+ ColumnarValue :: Scalar ( _) => {
292+ let val = result. is_valid ( 0 ) . then ( || result. value ( 0 ) . to_string ( ) ) ;
293+ Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( val) ) )
294+ }
295+ ColumnarValue :: Array ( _) => Ok ( ColumnarValue :: Array ( Arc :: new ( result) as ArrayRef ) ) ,
319296 }
320297}
321298
322- fn retry_date_as_timestamp (
323- array_ref : & ArrayRef ,
324- format_options : & FormatOptions ,
325- ) -> Result < String > {
326- let target_data_type = Date64 ;
327-
328- let date_value = cast ( & array_ref, & target_data_type) ?;
329- let formatter = ArrayFormatter :: try_new ( date_value. as_ref ( ) , format_options) ?;
330- let result = formatter. value ( 0 ) . try_to_string ( ) ?;
331-
332- Ok ( result)
333- }
334-
335299#[ cfg( test) ]
336300mod tests {
337301 use crate :: datetime:: to_char:: ToCharFunc ;
0 commit comments