1818use std:: any:: Any ;
1919use std:: sync:: { Arc , OnceLock } ;
2020
21- use arrow:: array:: { ArrayRef , GenericStringArray , OffsetSizeTrait , StringArray } ;
21+ use arrow:: array:: {
22+ Array , ArrayRef , GenericStringBuilder , OffsetSizeTrait , StringViewBuilder ,
23+ } ;
2224use arrow:: datatypes:: DataType ;
2325
2426use crate :: utils:: { make_scalar_function, utf8_to_str_type} ;
@@ -74,7 +76,7 @@ impl ScalarUDFImpl for InitcapFunc {
7476 DataType :: LargeUtf8 => make_scalar_function ( initcap :: < i64 > , vec ! [ ] ) ( args) ,
7577 DataType :: Utf8View => make_scalar_function ( initcap_utf8view, vec ! [ ] ) ( args) ,
7678 other => {
77- exec_err ! ( "Unsupported data type {other:?} for function initcap" )
79+ exec_err ! ( "Unsupported data type {other:?} for function ` initcap` " )
7880 }
7981 }
8082 }
@@ -90,9 +92,8 @@ fn get_initcap_doc() -> &'static Documentation {
9092 DOCUMENTATION . get_or_init ( || {
9193 Documentation :: builder (
9294 DOC_SECTION_STRING ,
93- "Capitalizes the first character in each word in the ASCII input string. \
94- Words are delimited by non-alphanumeric characters.\n \n \
95- Note this function does not support UTF-8 characters.",
95+ "Capitalizes the first character in each word in the input string. \
96+ Words are delimited by non-alphanumeric characters.",
9697 "initcap(str)" ,
9798 )
9899 . with_sql_example (
@@ -123,50 +124,70 @@ fn get_initcap_doc() -> &'static Documentation {
123124fn initcap < T : OffsetSizeTrait > ( args : & [ ArrayRef ] ) -> Result < ArrayRef > {
124125 let string_array = as_generic_string_array :: < T > ( & args[ 0 ] ) ?;
125126
126- // first map is the iterator, second is for the `Option<_>`
127- let result = string_array
128- . iter ( )
129- . map ( initcap_string)
130- . collect :: < GenericStringArray < T > > ( ) ;
127+ let mut builder = GenericStringBuilder :: < T > :: with_capacity (
128+ string_array. len ( ) ,
129+ string_array. value_data ( ) . len ( ) ,
130+ ) ;
131131
132- Ok ( Arc :: new ( result) as ArrayRef )
132+ string_array. iter ( ) . for_each ( |str| match str {
133+ Some ( s) => {
134+ let initcap_str = initcap_string ( s) ;
135+ builder. append_value ( initcap_str) ;
136+ }
137+ None => builder. append_null ( ) ,
138+ } ) ;
139+
140+ Ok ( Arc :: new ( builder. finish ( ) ) as ArrayRef )
133141}
134142
135143fn initcap_utf8view ( args : & [ ArrayRef ] ) -> Result < ArrayRef > {
136144 let string_view_array = as_string_view_array ( & args[ 0 ] ) ?;
137145
138- let result = string_view_array
139- . iter ( )
140- . map ( initcap_string)
141- . collect :: < StringArray > ( ) ;
146+ let mut builder = StringViewBuilder :: with_capacity ( string_view_array. len ( ) ) ;
147+
148+ string_view_array. iter ( ) . for_each ( |str| match str {
149+ Some ( s) => {
150+ let initcap_str = initcap_string ( s) ;
151+ builder. append_value ( initcap_str) ;
152+ }
153+ None => builder. append_null ( ) ,
154+ } ) ;
142155
143- Ok ( Arc :: new ( result ) as ArrayRef )
156+ Ok ( Arc :: new ( builder . finish ( ) ) as ArrayRef )
144157}
145158
/// Capitalizes the first character of every word in `input` and lowercases
/// the remaining characters of each word.
///
/// A word is a maximal run of alphanumeric characters; any non-alphanumeric
/// character acts as a delimiter and is copied through the case conversion
/// unchanged in practice (it has no case).
///
/// Pure-ASCII input takes a fast path that uses the ASCII-only conversions.
/// Any other input uses the Unicode-aware conversions, where a single
/// character may expand to several (hence `extend` rather than `push`).
fn initcap_string(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    // True when the previously seen character belongs to a word, meaning the
    // current character is not the first of its word.
    let mut prev_is_alphanumeric = false;

    if input.is_ascii() {
        for c in input.chars() {
            if prev_is_alphanumeric {
                result.push(c.to_ascii_lowercase());
            } else {
                result.push(c.to_ascii_uppercase());
            }
            prev_is_alphanumeric = c.is_ascii_alphanumeric();
        }
    } else {
        for c in input.chars() {
            if prev_is_alphanumeric {
                result.extend(c.to_lowercase());
            } else {
                result.extend(c.to_uppercase());
            }
            prev_is_alphanumeric = c.is_alphanumeric();
        }
    }

    result
}
164185
165186#[ cfg( test) ]
166187mod tests {
167- use crate :: string :: initcap:: InitcapFunc ;
188+ use crate :: unicode :: initcap:: InitcapFunc ;
168189 use crate :: utils:: test:: test_function;
169- use arrow:: array:: { Array , StringArray } ;
190+ use arrow:: array:: { Array , StringArray , StringViewArray } ;
170191 use arrow:: datatypes:: DataType :: Utf8 ;
171192 use datafusion_common:: { Result , ScalarValue } ;
172193 use datafusion_expr:: { ColumnarValue , ScalarUDFImpl } ;
@@ -181,6 +202,19 @@ mod tests {
181202 Utf8 ,
182203 StringArray
183204 ) ;
205+ test_function ! (
206+ InitcapFunc :: new( ) ,
207+ vec![ ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some (
208+ "êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
209+ . to_string( )
210+ ) ) ) ] ,
211+ Ok ( Some (
212+ "Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
213+ ) ) ,
214+ & str ,
215+ Utf8 ,
216+ StringArray
217+ ) ;
184218 test_function ! (
185219 InitcapFunc :: new( ) ,
186220 vec![ ColumnarValue :: Scalar ( ScalarValue :: from( "" ) ) ] ,
@@ -205,6 +239,7 @@ mod tests {
205239 Utf8 ,
206240 StringArray
207241 ) ;
242+
208243 test_function ! (
209244 InitcapFunc :: new( ) ,
210245 vec![ ColumnarValue :: Scalar ( ScalarValue :: Utf8View ( Some (
@@ -213,7 +248,7 @@ mod tests {
213248 Ok ( Some ( "Hi Thomas" ) ) ,
214249 & str ,
215250 Utf8 ,
216- StringArray
251+ StringViewArray
217252 ) ;
218253 test_function ! (
219254 InitcapFunc :: new( ) ,
@@ -223,7 +258,20 @@ mod tests {
223258 Ok ( Some ( "Hi Thomas With M0re Than 12 Chars" ) ) ,
224259 & str ,
225260 Utf8 ,
226- StringArray
261+ StringViewArray
262+ ) ;
263+ test_function ! (
264+ InitcapFunc :: new( ) ,
265+ vec![ ColumnarValue :: Scalar ( ScalarValue :: Utf8View ( Some (
266+ "đẸp đẼ êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
267+ . to_string( )
268+ ) ) ) ] ,
269+ Ok ( Some (
270+ "Đẹp Đẽ Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
271+ ) ) ,
272+ & str ,
273+ Utf8 ,
274+ StringViewArray
227275 ) ;
228276 test_function ! (
229277 InitcapFunc :: new( ) ,
@@ -233,15 +281,15 @@ mod tests {
233281 Ok ( Some ( "" ) ) ,
234282 & str ,
235283 Utf8 ,
236- StringArray
284+ StringViewArray
237285 ) ;
238286 test_function ! (
239287 InitcapFunc :: new( ) ,
240288 vec![ ColumnarValue :: Scalar ( ScalarValue :: Utf8View ( None ) ) ] ,
241289 Ok ( None ) ,
242290 & str ,
243291 Utf8 ,
244- StringArray
292+ StringViewArray
245293 ) ;
246294
247295 Ok ( ( ) )
0 commit comments