1818use std:: any:: Any ;
1919use std:: sync:: Arc ;
2020
21+ use arrow:: array:: builder:: StringBuilder ;
2122use arrow:: array:: cast:: AsArray ;
22- use arrow:: array:: { Array , ArrayRef , StringArray , new_null_array} ;
23+ use arrow:: array:: { Array , ArrayRef , new_null_array} ;
2324use arrow:: compute:: cast;
2425use arrow:: datatypes:: DataType ;
2526use arrow:: datatypes:: DataType :: {
2627 Date32 , Date64 , Duration , Time32 , Time64 , Timestamp , Utf8 ,
2728} ;
2829use arrow:: datatypes:: TimeUnit :: { Microsecond , Millisecond , Nanosecond , Second } ;
29- use arrow:: error:: ArrowError ;
3030use arrow:: util:: display:: { ArrayFormatter , DurationFormat , FormatOptions } ;
3131use datafusion_common:: { Result , ScalarValue , exec_err, utils:: take_function_args} ;
3232use datafusion_expr:: TypeSignature :: Exact ;
@@ -143,20 +143,15 @@ impl ScalarUDFImpl for ToCharFunc {
143143 let [ date_time, format] = take_function_args ( self . name ( ) , & args) ?;
144144
145145 match format {
146- ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( None ) )
147- | ColumnarValue :: Scalar ( ScalarValue :: Null ) => to_char_scalar ( date_time, None ) ,
148- // constant format
149- ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some ( format) ) ) => {
150- // invoke to_char_scalar with the known string, without converting to array
151- to_char_scalar ( date_time, Some ( format) )
146+ ColumnarValue :: Scalar ( ScalarValue :: Null ) => to_char_scalar ( date_time, None ) ,
147+ ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( fmt) ) => {
148+ to_char_scalar ( date_time, fmt. as_deref ( ) )
152149 }
153150 ColumnarValue :: Array ( _) => to_char_array ( & args) ,
154- _ => {
155- exec_err ! (
156- "Format for `to_char` must be non-null Utf8, received {}" ,
157- format. data_type( )
158- )
159- }
151+ _ => exec_err ! (
152+ "Format for `to_char` must be non-null Utf8, received {}" ,
153+ format. data_type( )
154+ ) ,
160155 }
161156 }
162157
@@ -172,7 +167,7 @@ impl ScalarUDFImpl for ToCharFunc {
172167fn build_format_options < ' a > (
173168 data_type : & DataType ,
174169 format : Option < & ' a str > ,
175- ) -> Result < FormatOptions < ' a > , Result < ColumnarValue > > {
170+ ) -> Result < FormatOptions < ' a > > {
176171 let Some ( format) = format else {
177172 return Ok ( FormatOptions :: new ( ) ) ;
178173 } ;
@@ -194,24 +189,24 @@ fn build_format_options<'a>(
194189 } ,
195190 ) ,
196191 other => {
197- return Err ( exec_err ! (
192+ return exec_err ! (
198193 "to_char only supports date, time, timestamp and duration data types, received {other:?}"
199- ) ) ;
194+ ) ;
200195 }
201196 } ;
202197 Ok ( format_options)
203198}
204199
205- /// Special version when arg\[1] is a scalar
200+ /// Formats `expression` using a constant `format` string.
206201fn to_char_scalar (
207202 expression : & ColumnarValue ,
208203 format : Option < & str > ,
209204) -> Result < ColumnarValue > {
210- // it's possible that the expression is a scalar however because
211- // of the implementation in arrow-rs we need to convert it to an array
205+ // ArrayFormatter requires an array, so scalar expressions must be
206+ // converted to a 1-element array first.
212207 let data_type = & expression. data_type ( ) ;
213208 let is_scalar_expression = matches ! ( & expression, ColumnarValue :: Scalar ( _) ) ;
214- let array = expression. clone ( ) . into_array ( 1 ) ?;
209+ let array = expression. to_array ( 1 ) ?;
215210
216211 if format. is_none ( ) {
217212 return if is_scalar_expression {
@@ -221,117 +216,95 @@ fn to_char_scalar(
221216 } ;
222217 }
223218
224- let format_options = match build_format_options ( data_type, format) {
225- Ok ( value) => value,
226- Err ( value) => return value,
227- } ;
228-
219+ let format_options = build_format_options ( data_type, format) ?;
229220 let formatter = ArrayFormatter :: try_new ( array. as_ref ( ) , & format_options) ?;
230- let formatted: Result < Vec < Option < String > > , ArrowError > = ( 0 ..array. len ( ) )
231- . map ( |i| {
232- if array. is_null ( i) {
233- Ok ( None )
234- } else {
235- formatter. value ( i) . try_to_string ( ) . map ( Some )
236- }
237- } )
238- . collect ( ) ;
239-
240- if let Ok ( formatted) = formatted {
241- if is_scalar_expression {
242- Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 (
243- formatted. first ( ) . unwrap ( ) . clone ( ) ,
244- ) ) )
221+
222+ let fmt_len = format. map_or ( 20 , |f| f. len ( ) + 10 ) ;
223+ let mut builder = StringBuilder :: with_capacity ( array. len ( ) , array. len ( ) * fmt_len) ;
224+
225+ for i in 0 ..array. len ( ) {
226+ if array. is_null ( i) {
227+ builder. append_null ( ) ;
245228 } else {
246- Ok ( ColumnarValue :: Array (
247- Arc :: new ( StringArray :: from ( formatted) ) as ArrayRef
248- ) )
249- }
250- } else {
251- // if the data type was a Date32, formatting could have failed because the format string
252- // contained datetime specifiers, so we'll retry by casting the date array as a timestamp array
253- if data_type == & Date32 {
254- return to_char_scalar ( & expression. cast_to ( & Date64 , None ) ?, format) ;
229+ // Write directly into the builder's internal buffer, then
230+ // commit the value with append_value("").
231+ match formatter. value ( i) . write ( & mut builder) {
232+ Ok ( ( ) ) => builder. append_value ( "" ) ,
233+ // Arrow's Date32 formatter only handles date specifiers
234+ // (%Y, %m, %d, ...). Format strings with time specifiers
235+ // (%H, %M, %S, ...) cause it to fail. When this happens,
236+ // we retry by casting to Date64, whose datetime formatter
237+ // handles both date and time specifiers (with zero for
238+ // the time components).
239+ Err ( _) if data_type == & Date32 => {
240+ return to_char_scalar ( & expression. cast_to ( & Date64 , None ) ?, format) ;
241+ }
242+ Err ( e) => return Err ( e. into ( ) ) ,
243+ }
255244 }
245+ }
256246
257- exec_err ! ( "{}" , formatted. unwrap_err( ) )
247+ let result = builder. finish ( ) ;
248+ if is_scalar_expression {
249+ let val = result. is_valid ( 0 ) . then ( || result. value ( 0 ) . to_string ( ) ) ;
250+ Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( val) ) )
251+ } else {
252+ Ok ( ColumnarValue :: Array ( Arc :: new ( result) as ArrayRef ) )
258253 }
259254}
260255
261256fn to_char_array ( args : & [ ColumnarValue ] ) -> Result < ColumnarValue > {
262257 let arrays = ColumnarValue :: values_to_arrays ( args) ?;
263- let mut results : Vec < Option < String > > = vec ! [ ] ;
258+ let data_array = & arrays [ 0 ] ;
264259 let format_array = arrays[ 1 ] . as_string :: < i32 > ( ) ;
265- let data_type = arrays [ 0 ] . data_type ( ) ;
260+ let data_type = data_array . data_type ( ) ;
266261
267- for idx in 0 ..arrays [ 0 ] . len ( ) {
268- let format = if format_array . is_null ( idx ) {
269- None
270- } else {
271- Some ( format_array . value ( idx ) )
272- } ;
273- if format . is_none ( ) {
274- results . push ( None ) ;
262+ let fmt_len = 30 ;
263+ let mut builder =
264+ StringBuilder :: with_capacity ( data_array . len ( ) , data_array . len ( ) * fmt_len ) ;
265+ let mut buffer = String :: with_capacity ( fmt_len ) ;
266+
267+ for idx in 0 ..data_array . len ( ) {
268+ if format_array . is_null ( idx ) || data_array . is_null ( idx ) {
269+ builder . append_null ( ) ;
275270 continue ;
276271 }
277- let format_options = match build_format_options ( data_type, format) {
278- Ok ( value) => value,
279- Err ( value) => return value,
280- } ;
281- // this isn't ideal but this can't use ValueFormatter as it isn't independent
282- // from ArrayFormatter
283- let formatter = ArrayFormatter :: try_new ( arrays[ 0 ] . as_ref ( ) , & format_options) ?;
284- let result = formatter. value ( idx) . try_to_string ( ) ;
285- match result {
286- Ok ( value) => results. push ( Some ( value) ) ,
287- Err ( e) => {
288- // if the data type was a Date32, formatting could have failed because the format string
289- // contained datetime specifiers, so we'll treat this specific date element as a timestamp
290- if data_type == & Date32 {
291- let failed_date_value = arrays[ 0 ] . slice ( idx, 1 ) ;
292-
293- match retry_date_as_timestamp ( & failed_date_value, & format_options) {
294- Ok ( value) => {
295- results. push ( Some ( value) ) ;
296- continue ;
297- }
298- Err ( e) => {
299- return exec_err ! ( "{}" , e) ;
300- }
301- }
302- }
303272
304- return exec_err ! ( "{}" , e) ;
273+ let format = Some ( format_array. value ( idx) ) ;
274+ let format_options = build_format_options ( data_type, format) ?;
275+ let formatter = ArrayFormatter :: try_new ( data_array. as_ref ( ) , & format_options) ?;
276+
277+ buffer. clear ( ) ;
278+
279+ // We'd prefer to write directly to the StringBuilder's internal buffer,
280+ // but the write might fail, and there's no easy way to ensure a partial
281+ // write is removed from the buffer. So instead we write to a temporary
282+ // buffer and `append_value` on success.
283+ match formatter. value ( idx) . write ( & mut buffer) {
284+ Ok ( ( ) ) => builder. append_value ( & buffer) ,
285+ // Retry with Date64 (see comment in to_char_scalar).
286+ Err ( _) if data_type == & Date32 => {
287+ buffer. clear ( ) ;
288+ let date64_value = cast ( & data_array. slice ( idx, 1 ) , & Date64 ) ?;
289+ let retry_fmt =
290+ ArrayFormatter :: try_new ( date64_value. as_ref ( ) , & format_options) ?;
291+ retry_fmt. value ( 0 ) . write ( & mut buffer) ?;
292+ builder. append_value ( & buffer) ;
305293 }
294+ Err ( e) => return Err ( e. into ( ) ) ,
306295 }
307296 }
308297
298+ let result = builder. finish ( ) ;
309299 match args[ 0 ] {
310- ColumnarValue :: Array ( _) => Ok ( ColumnarValue :: Array ( Arc :: new ( StringArray :: from (
311- results,
312- ) ) as ArrayRef ) ) ,
313- ColumnarValue :: Scalar ( _) => match results. first ( ) . unwrap ( ) {
314- Some ( value) => Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some (
315- value. to_string ( ) ,
316- ) ) ) ) ,
317- None => Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( None ) ) ) ,
318- } ,
300+ ColumnarValue :: Scalar ( _) => {
301+ let val = result. is_valid ( 0 ) . then ( || result. value ( 0 ) . to_string ( ) ) ;
302+ Ok ( ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( val) ) )
303+ }
304+ ColumnarValue :: Array ( _) => Ok ( ColumnarValue :: Array ( Arc :: new ( result) as ArrayRef ) ) ,
319305 }
320306}
321307
322- fn retry_date_as_timestamp (
323- array_ref : & ArrayRef ,
324- format_options : & FormatOptions ,
325- ) -> Result < String > {
326- let target_data_type = Date64 ;
327-
328- let date_value = cast ( & array_ref, & target_data_type) ?;
329- let formatter = ArrayFormatter :: try_new ( date_value. as_ref ( ) , format_options) ?;
330- let result = formatter. value ( 0 ) . try_to_string ( ) ?;
331-
332- Ok ( result)
333- }
334-
335308#[ cfg( test) ]
336309mod tests {
337310 use crate :: datetime:: to_char:: ToCharFunc ;
0 commit comments