-
Notifications
You must be signed in to change notification settings - Fork 272
perf: Improve performance of CAST from string to int #3017
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
42a6d64
768a017
ca90587
2b331f0
5abce5f
94d0f32
f1299eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -389,13 +389,23 @@ macro_rules! cast_utf8_to_int { | |
| ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident) => {{ | ||
| let len = $array.len(); | ||
| let mut cast_array = PrimitiveArray::<$array_type>::builder(len); | ||
| for i in 0..len { | ||
| if $array.is_null(i) { | ||
| cast_array.append_null() | ||
| } else if let Some(cast_value) = $cast_method($array.value(i), $eval_mode)? { | ||
| cast_array.append_value(cast_value); | ||
| } else { | ||
| cast_array.append_null() | ||
| if $array.null_count() == 0 { | ||
| for i in 0..len { | ||
| if let Some(cast_value) = $cast_method($array.value(i), $eval_mode)? { | ||
| cast_array.append_value(cast_value); | ||
| } else { | ||
| cast_array.append_null() | ||
| } | ||
| } | ||
| } else { | ||
| for i in 0..len { | ||
| if $array.is_null(i) { | ||
| cast_array.append_null() | ||
| } else if let Some(cast_value) = $cast_method($array.value(i), $eval_mode)? { | ||
| cast_array.append_value(cast_value); | ||
| } else { | ||
| cast_array.append_null() | ||
| } | ||
| } | ||
| } | ||
| let result: SparkResult<ArrayRef> = Ok(Arc::new(cast_array.finish()) as ArrayRef); | ||
|
|
@@ -1965,33 +1975,41 @@ fn do_cast_string_to_int< | |
| type_name: &str, | ||
| min_value: T, | ||
| ) -> SparkResult<Option<T>> { | ||
| let trimmed_str = str.trim(); | ||
| if trimmed_str.is_empty() { | ||
| let bytes = str.as_bytes(); | ||
| let mut start = 0; | ||
| let mut end = bytes.len(); | ||
|
|
||
| while start < end && bytes[start].is_ascii_whitespace() { | ||
| start += 1; | ||
| } | ||
| while end > start && bytes[end - 1].is_ascii_whitespace() { | ||
| end -= 1; | ||
| } | ||
|
||
|
|
||
| if start == end { | ||
| return none_or_err(eval_mode, type_name, str); | ||
| } | ||
| let trimmed_str = &str[start..end]; | ||
| let len = trimmed_str.len(); | ||
| let trimmed_bytes = trimmed_str.as_bytes(); | ||
| let mut result: T = T::zero(); | ||
| let mut negative = false; | ||
| let mut idx = 0; | ||
| let first_char = trimmed_bytes[0]; | ||
| let negative = first_char == b'-'; | ||
| if negative || first_char == b'+' { | ||
| idx = 1; | ||
| if len == 1 { | ||
| return none_or_err(eval_mode, type_name, str); | ||
| } | ||
| } | ||
|
||
|
|
||
| let radix = T::from(10); | ||
| let stop_value = min_value / radix; | ||
| let mut parse_sign_and_digits = true; | ||
|
|
||
| for (i, ch) in trimmed_str.char_indices() { | ||
| for &ch in &trimmed_bytes[idx..] { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cleaner and faster approach to access the chars directly |
||
| if parse_sign_and_digits { | ||
| if i == 0 { | ||
| negative = ch == '-'; | ||
| let positive = ch == '+'; | ||
| if negative || positive { | ||
| if i + 1 == len { | ||
| // input string is just "+" or "-" | ||
| return none_or_err(eval_mode, type_name, str); | ||
| } | ||
| // consume this char | ||
| continue; | ||
| } | ||
| } | ||
|
|
||
| if ch == '.' { | ||
| if ch == b'.' { | ||
| if eval_mode == EvalMode::Legacy { | ||
| // truncate decimal in legacy mode | ||
| parse_sign_and_digits = false; | ||
|
||
|
|
@@ -2014,7 +2032,6 @@ fn do_cast_string_to_int< | |
| if result < stop_value { | ||
| return none_or_err(eval_mode, type_name, str); | ||
| } | ||
|
|
||
| // Since the previous result is greater than or equal to stopValue(Integer.MIN_VALUE / | ||
| // radix), we can just use `result > 0` to check overflow. If result | ||
| // overflows, we should stop | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
made null check conditional to remove unwanted branching