Skip to content
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 43 additions & 26 deletions native/spark-expr/src/conversion_funcs/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,13 +389,23 @@ macro_rules! cast_utf8_to_int {
($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident) => {{
let len = $array.len();
let mut cast_array = PrimitiveArray::<$array_type>::builder(len);
for i in 0..len {
if $array.is_null(i) {
cast_array.append_null()
} else if let Some(cast_value) = $cast_method($array.value(i), $eval_mode)? {
cast_array.append_value(cast_value);
} else {
cast_array.append_null()
if $array.null_count() == 0 {
for i in 0..len {
if let Some(cast_value) = $cast_method($array.value(i), $eval_mode)? {
cast_array.append_value(cast_value);
} else {
cast_array.append_null()
}
}
} else {
for i in 0..len {
if $array.is_null(i) {
cast_array.append_null()
} else if let Some(cast_value) = $cast_method($array.value(i), $eval_mode)? {
cast_array.append_value(cast_value);
} else {
cast_array.append_null()
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

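Made the per-element null check conditional: when the array has no nulls (`null_count() == 0`), the loop skips the `is_null` test, which removes unwanted branching.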

}
}
let result: SparkResult<ArrayRef> = Ok(Arc::new(cast_array.finish()) as ArrayRef);
Expand Down Expand Up @@ -1965,33 +1975,41 @@ fn do_cast_string_to_int<
type_name: &str,
min_value: T,
) -> SparkResult<Option<T>> {
let trimmed_str = str.trim();
if trimmed_str.is_empty() {
let bytes = str.as_bytes();
let mut start = 0;
let mut end = bytes.len();

while start < end && bytes[start].is_ascii_whitespace() {
start += 1;
}
while end > start && bytes[end - 1].is_ascii_whitespace() {
end -= 1;
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not creating a new string through the trim function; instead, just skipping over the leading and trailing whitespace.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was actually wrong in an earlier review when I suggested stopping using trim. trim does just return a slice on a &str, not a new String.

It may be worth considering using trim_ascii instead.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will look at using trim_ascii in a separate PR along with some other minor changes.


if start == end {
return none_or_err(eval_mode, type_name, str);
}
let trimmed_str = &str[start..end];
let len = trimmed_str.len();
let trimmed_bytes = trimmed_str.as_bytes();
let mut result: T = T::zero();
let mut negative = false;
let mut idx = 0;
let first_char = trimmed_bytes[0];
let negative = first_char == b'-';
if negative || first_char == b'+' {
idx = 1;
if len == 1 {
return none_or_err(eval_mode, type_name, str);
}
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same (removed unwanted If branching)


let radix = T::from(10);
let stop_value = min_value / radix;
let mut parse_sign_and_digits = true;

for (i, ch) in trimmed_str.char_indices() {
for &ch in &trimmed_bytes[idx..] {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cleaner and faster approach: iterate over the bytes directly.

if parse_sign_and_digits {
if i == 0 {
negative = ch == '-';
let positive = ch == '+';
if negative || positive {
if i + 1 == len {
// input string is just "+" or "-"
return none_or_err(eval_mode, type_name, str);
}
// consume this char
continue;
}
}

if ch == '.' {
if ch == b'.' {
if eval_mode == EvalMode::Legacy {
// truncate decimal in legacy mode
parse_sign_and_digits = false;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The eval_mode does not change for different rows. It would likely be more performant to have separate implementations for legacy vs other modes to avoid the conditional in the hot loop.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great suggestion. Let me go ahead and make separate paths for each eval mode to avoid the conditional in the hot loop, and I will update this thread with benchmarks.

Expand All @@ -2014,7 +2032,6 @@ fn do_cast_string_to_int<
if result < stop_value {
return none_or_err(eval_mode, type_name, str);
}

// Since the previous result is greater than or equal to stopValue(Integer.MIN_VALUE /
// radix), we can just use `result > 0` to check overflow. If result
// overflows, we should stop
Expand Down
Loading