Skip to content

Commit a267784

Browse files
authored
Support unicode character for initcap function (#13752)
* Support unicode character for 'initcap' function

  Signed-off-by: Tai Le Manh <[email protected]>

* Update unit tests
* Fix clippy warning
* Update sqllogictests - initcap
* Update scalar_functions.md docs
* Add suggestions change

  Signed-off-by: Tai Le Manh <[email protected]>

---------

Signed-off-by: Tai Le Manh <[email protected]>
1 parent 2639fe0 commit a267784

File tree

7 files changed

+93
-47
lines changed

7 files changed

+93
-47
lines changed

datafusion/functions/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,4 +212,4 @@ required-features = ["math_expressions"]
212212
[[bench]]
213213
harness = false
214214
name = "initcap"
215-
required-features = ["string_expressions"]
215+
required-features = ["unicode_expressions"]

datafusion/functions/benches/initcap.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use arrow::util::bench_util::{
2424
};
2525
use criterion::{black_box, criterion_group, criterion_main, Criterion};
2626
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
27-
use datafusion_functions::string;
27+
use datafusion_functions::unicode;
2828
use std::sync::Arc;
2929

3030
fn create_args<O: OffsetSizeTrait>(
@@ -46,7 +46,7 @@ fn create_args<O: OffsetSizeTrait>(
4646
}
4747

4848
fn criterion_benchmark(c: &mut Criterion) {
49-
let initcap = string::initcap();
49+
let initcap = unicode::initcap();
5050
for size in [1024, 4096] {
5151
let args = create_args::<i32>(size, 8, true);
5252
c.bench_function(

datafusion/functions/src/string/mod.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ pub mod concat;
3030
pub mod concat_ws;
3131
pub mod contains;
3232
pub mod ends_with;
33-
pub mod initcap;
3433
pub mod levenshtein;
3534
pub mod lower;
3635
pub mod ltrim;
@@ -52,7 +51,6 @@ make_udf_function!(chr::ChrFunc, chr);
5251
make_udf_function!(concat::ConcatFunc, concat);
5352
make_udf_function!(concat_ws::ConcatWsFunc, concat_ws);
5453
make_udf_function!(ends_with::EndsWithFunc, ends_with);
55-
make_udf_function!(initcap::InitcapFunc, initcap);
5654
make_udf_function!(levenshtein::LevenshteinFunc, levenshtein);
5755
make_udf_function!(ltrim::LtrimFunc, ltrim);
5856
make_udf_function!(lower::LowerFunc, lower);
@@ -94,10 +92,6 @@ pub mod expr_fn {
9492
ends_with,
9593
"Returns true if the `string` ends with the `suffix`, false otherwise.",
9694
string suffix
97-
),(
98-
initcap,
99-
"Converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase",
100-
string
10195
),(
10296
levenshtein,
10397
"Returns the Levenshtein distance between the two given strings",
@@ -177,7 +171,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
177171
concat(),
178172
concat_ws(),
179173
ends_with(),
180-
initcap(),
181174
levenshtein(),
182175
lower(),
183176
ltrim(),

datafusion/functions/src/string/initcap.rs renamed to datafusion/functions/src/unicode/initcap.rs

Lines changed: 81 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
use std::any::Any;
1919
use std::sync::{Arc, OnceLock};
2020

21-
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray};
21+
use arrow::array::{
22+
Array, ArrayRef, GenericStringBuilder, OffsetSizeTrait, StringViewBuilder,
23+
};
2224
use arrow::datatypes::DataType;
2325

2426
use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -74,7 +76,7 @@ impl ScalarUDFImpl for InitcapFunc {
7476
DataType::LargeUtf8 => make_scalar_function(initcap::<i64>, vec![])(args),
7577
DataType::Utf8View => make_scalar_function(initcap_utf8view, vec![])(args),
7678
other => {
77-
exec_err!("Unsupported data type {other:?} for function initcap")
79+
exec_err!("Unsupported data type {other:?} for function `initcap`")
7880
}
7981
}
8082
}
@@ -90,9 +92,8 @@ fn get_initcap_doc() -> &'static Documentation {
9092
DOCUMENTATION.get_or_init(|| {
9193
Documentation::builder(
9294
DOC_SECTION_STRING,
93-
"Capitalizes the first character in each word in the ASCII input string. \
94-
Words are delimited by non-alphanumeric characters.\n\n\
95-
Note this function does not support UTF-8 characters.",
95+
"Capitalizes the first character in each word in the input string. \
96+
Words are delimited by non-alphanumeric characters.",
9697
"initcap(str)",
9798
)
9899
.with_sql_example(
@@ -123,50 +124,70 @@ fn get_initcap_doc() -> &'static Documentation {
123124
fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
124125
let string_array = as_generic_string_array::<T>(&args[0])?;
125126

126-
// first map is the iterator, second is for the `Option<_>`
127-
let result = string_array
128-
.iter()
129-
.map(initcap_string)
130-
.collect::<GenericStringArray<T>>();
127+
let mut builder = GenericStringBuilder::<T>::with_capacity(
128+
string_array.len(),
129+
string_array.value_data().len(),
130+
);
131131

132-
Ok(Arc::new(result) as ArrayRef)
132+
string_array.iter().for_each(|str| match str {
133+
Some(s) => {
134+
let initcap_str = initcap_string(s);
135+
builder.append_value(initcap_str);
136+
}
137+
None => builder.append_null(),
138+
});
139+
140+
Ok(Arc::new(builder.finish()) as ArrayRef)
133141
}
134142

135143
fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
136144
let string_view_array = as_string_view_array(&args[0])?;
137145

138-
let result = string_view_array
139-
.iter()
140-
.map(initcap_string)
141-
.collect::<StringArray>();
146+
let mut builder = StringViewBuilder::with_capacity(string_view_array.len());
147+
148+
string_view_array.iter().for_each(|str| match str {
149+
Some(s) => {
150+
let initcap_str = initcap_string(s);
151+
builder.append_value(initcap_str);
152+
}
153+
None => builder.append_null(),
154+
});
142155

143-
Ok(Arc::new(result) as ArrayRef)
156+
Ok(Arc::new(builder.finish()) as ArrayRef)
144157
}
145158

146-
fn initcap_string(input: Option<&str>) -> Option<String> {
147-
input.map(|s| {
148-
let mut result = String::with_capacity(s.len());
149-
let mut prev_is_alphanumeric = false;
159+
fn initcap_string(input: &str) -> String {
160+
let mut result = String::with_capacity(input.len());
161+
let mut prev_is_alphanumeric = false;
150162

151-
for c in s.chars() {
152-
let transformed = if prev_is_alphanumeric {
153-
c.to_ascii_lowercase()
163+
if input.is_ascii() {
164+
for c in input.chars() {
165+
if prev_is_alphanumeric {
166+
result.push(c.to_ascii_lowercase());
154167
} else {
155-
c.to_ascii_uppercase()
168+
result.push(c.to_ascii_uppercase());
156169
};
157-
result.push(transformed);
158170
prev_is_alphanumeric = c.is_ascii_alphanumeric();
159171
}
172+
} else {
173+
for c in input.chars() {
174+
if prev_is_alphanumeric {
175+
result.extend(c.to_lowercase());
176+
} else {
177+
result.extend(c.to_uppercase());
178+
}
179+
prev_is_alphanumeric = c.is_alphanumeric();
180+
}
181+
}
160182

161-
result
162-
})
183+
result
163184
}
164185

165186
#[cfg(test)]
166187
mod tests {
167-
use crate::string::initcap::InitcapFunc;
188+
use crate::unicode::initcap::InitcapFunc;
168189
use crate::utils::test::test_function;
169-
use arrow::array::{Array, StringArray};
190+
use arrow::array::{Array, StringArray, StringViewArray};
170191
use arrow::datatypes::DataType::Utf8;
171192
use datafusion_common::{Result, ScalarValue};
172193
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
@@ -181,6 +202,19 @@ mod tests {
181202
Utf8,
182203
StringArray
183204
);
205+
test_function!(
206+
InitcapFunc::new(),
207+
vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
208+
"êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
209+
.to_string()
210+
)))],
211+
Ok(Some(
212+
"Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
213+
)),
214+
&str,
215+
Utf8,
216+
StringArray
217+
);
184218
test_function!(
185219
InitcapFunc::new(),
186220
vec![ColumnarValue::Scalar(ScalarValue::from(""))],
@@ -205,6 +239,7 @@ mod tests {
205239
Utf8,
206240
StringArray
207241
);
242+
208243
test_function!(
209244
InitcapFunc::new(),
210245
vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
@@ -213,7 +248,7 @@ mod tests {
213248
Ok(Some("Hi Thomas")),
214249
&str,
215250
Utf8,
216-
StringArray
251+
StringViewArray
217252
);
218253
test_function!(
219254
InitcapFunc::new(),
@@ -223,7 +258,20 @@ mod tests {
223258
Ok(Some("Hi Thomas With M0re Than 12 Chars")),
224259
&str,
225260
Utf8,
226-
StringArray
261+
StringViewArray
262+
);
263+
test_function!(
264+
InitcapFunc::new(),
265+
vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
266+
"đẸp đẼ êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
267+
.to_string()
268+
)))],
269+
Ok(Some(
270+
"Đẹp Đẽ Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
271+
)),
272+
&str,
273+
Utf8,
274+
StringViewArray
227275
);
228276
test_function!(
229277
InitcapFunc::new(),
@@ -233,15 +281,15 @@ mod tests {
233281
Ok(Some("")),
234282
&str,
235283
Utf8,
236-
StringArray
284+
StringViewArray
237285
);
238286
test_function!(
239287
InitcapFunc::new(),
240288
vec![ColumnarValue::Scalar(ScalarValue::Utf8View(None))],
241289
Ok(None),
242290
&str,
243291
Utf8,
244-
StringArray
292+
StringViewArray
245293
);
246294

247295
Ok(())

datafusion/functions/src/unicode/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use datafusion_expr::ScalarUDF;
2323

2424
pub mod character_length;
2525
pub mod find_in_set;
26+
pub mod initcap;
2627
pub mod left;
2728
pub mod lpad;
2829
pub mod reverse;
@@ -36,6 +37,7 @@ pub mod translate;
3637
// create UDFs
3738
make_udf_function!(character_length::CharacterLengthFunc, character_length);
3839
make_udf_function!(find_in_set::FindInSetFunc, find_in_set);
40+
make_udf_function!(initcap::InitcapFunc, initcap);
3941
make_udf_function!(left::LeftFunc, left);
4042
make_udf_function!(lpad::LPadFunc, lpad);
4143
make_udf_function!(right::RightFunc, right);
@@ -94,6 +96,10 @@ pub mod expr_fn {
9496
left,
9597
"returns the first `n` characters in the `string`",
9698
string n
99+
),(
100+
initcap,
101+
"converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase",
102+
string
97103
),(
98104
find_in_set,
99105
"Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings",
@@ -126,6 +132,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
126132
vec![
127133
character_length(),
128134
find_in_set(),
135+
initcap(),
129136
left(),
130137
lpad(),
131138
reverse(),

datafusion/sqllogictest/test_files/string/string_query.slt.part

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,7 @@ Andrew Datafusion📊🔥
460460
Xiangpeng Datafusion数据融合
461461
Raphael Datafusionдатафусион
462462
Under_Score Un Iść Core
463-
Percent Pan Tadeusz Ma Iść W KąT
463+
Percent Pan Tadeusz Ma Iść W Kąt
464464
(empty) (empty)
465465
(empty) (empty)
466466
% (empty)

docs/source/user-guide/sql/scalar_functions.md

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,9 +1070,7 @@ find_in_set(str, strlist)
10701070

10711071
### `initcap`
10721072

1073-
Capitalizes the first character in each word in the ASCII input string. Words are delimited by non-alphanumeric characters.
1074-
1075-
Note this function does not support UTF-8 characters.
1073+
Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters.
10761074

10771075
```
10781076
initcap(str)

0 commit comments

Comments
 (0)