Skip to content

Commit 82b2c4a

Browse files
committed
perf: optimize trim functions to reuse pattern buffer
Replace per-row Vec<char> allocation with a reusable buffer in trim functions (ltrim, rtrim, btrim).

The previous implementation allocated a Vec<char> for the pattern on every row, which was inefficient. This optimization introduces a pattern_buf that is allocated once and reused across all rows by clearing and refilling it.

Changes:
- Refactored general_trim to pass TrimType directly instead of closures
- Created apply_trim helper function that accepts a mutable pattern buffer
- Updated string_view_trim and string_trim to allocate pattern_buf once
- Buffer is cleared and reused for each row to avoid repeated allocations

Benchmark results for ltrim (size=1024):
- INPUT LEN <= 12, string_view: 21.484 µs -> 13.243 µs (38.4% faster, 1.6x speedup)
- INPUT LEN <= 12, string: 21.540 µs -> 14.051 µs (34.8% faster, 1.5x speedup)
- INPUT LEN > 12, OUTPUT LEN > 12, string_view: 21.951 µs -> 13.325 µs (39.3% faster, 1.6x speedup)
- INPUT LEN > 12, OUTPUT LEN > 12, string: 24.328 µs -> 16.844 µs (30.8% faster, 1.4x speedup)
- INPUT LEN > 12, OUTPUT LEN <= 12, string_view: 87.967 µs -> 77.016 µs (12.4% faster)

Benchmark results for ltrim (size=4096):
- INPUT LEN <= 12, string_view: 85.626 µs -> 51.478 µs (39.9% faster, 1.7x speedup)
- INPUT LEN <= 12, string: 84.011 µs -> 54.774 µs (34.8% faster, 1.5x speedup)
- INPUT LEN > 12, OUTPUT LEN > 12, string_view: 85.964 µs -> 51.825 µs (39.7% faster, 1.7x speedup)
- INPUT LEN > 12, OUTPUT LEN > 12, string: 102.42 µs -> 74.097 µs (27.7% faster, 1.4x speedup)

The optimization shows consistent 28-40% improvement across most workloads by eliminating per-row Vec allocations. This applies to all trim variants (ltrim, rtrim, btrim) as they share the same underlying implementation.
1 parent 7c50448 commit 82b2c4a

File tree

1 file changed

+67
-72
lines changed

1 file changed

+67
-72
lines changed

datafusion/functions/src/string/common.rs

Lines changed: 67 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -54,75 +54,20 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
5454
trim_type: TrimType,
5555
use_string_view: bool,
5656
) -> Result<ArrayRef> {
57-
let func = match trim_type {
58-
TrimType::Left => |input, pattern: &str| {
59-
let pattern = pattern.chars().collect::<Vec<char>>();
60-
let ltrimmed_str =
61-
str::trim_start_matches::<&[char]>(input, pattern.as_ref());
62-
// `ltrimmed_str` is actually `input`[start_offset..],
63-
// so `start_offset` = len(`input`) - len(`ltrimmed_str`)
64-
let start_offset = input.len() - ltrimmed_str.len();
65-
66-
(ltrimmed_str, start_offset as u32)
67-
},
68-
TrimType::Right => |input, pattern: &str| {
69-
let pattern = pattern.chars().collect::<Vec<char>>();
70-
let rtrimmed_str = str::trim_end_matches::<&[char]>(input, pattern.as_ref());
71-
72-
// `ltrimmed_str` is actually `input`[0..new_len], so `start_offset` is 0
73-
(rtrimmed_str, 0)
74-
},
75-
TrimType::Both => |input, pattern: &str| {
76-
let pattern = pattern.chars().collect::<Vec<char>>();
77-
let ltrimmed_str =
78-
str::trim_start_matches::<&[char]>(input, pattern.as_ref());
79-
// `btrimmed_str` can be got by rtrim(ltrim(`input`)),
80-
// so its `start_offset` should be same as ltrim situation above
81-
let start_offset = input.len() - ltrimmed_str.len();
82-
let btrimmed_str =
83-
str::trim_end_matches::<&[char]>(ltrimmed_str, pattern.as_ref());
84-
85-
(btrimmed_str, start_offset as u32)
86-
},
87-
};
88-
8957
if use_string_view {
90-
string_view_trim(func, args)
58+
string_view_trim(trim_type, args)
9159
} else {
92-
string_trim::<T>(func, args)
60+
string_trim::<T>(trim_type, args)
9361
}
9462
}
9563

9664
/// Applies the trim function to the given string view array(s)
9765
/// and returns a new string view array with the trimmed values.
98-
///
99-
/// # `trim_func`: The function to apply to each string view.
100-
///
101-
/// ## Arguments
102-
/// - The original string
103-
/// - the pattern to trim
104-
///
105-
/// ## Returns
106-
/// - trimmed str (must be a substring of the first argument)
107-
/// - start offset, needed in `string_view_trim`
108-
///
109-
/// ## Examples
110-
///
111-
/// For `ltrim`:
112-
/// - `fn(" abc", " ") -> ("abc", 2)`
113-
/// - `fn("abd", " ") -> ("abd", 0)`
114-
///
115-
/// For `btrim`:
116-
/// - `fn(" abc ", " ") -> ("abc", 2)`
117-
/// - `fn("abd", " ") -> ("abd", 0)`
118-
// removing 'a will cause compiler complaining lifetime of `func`
119-
fn string_view_trim<'a>(
120-
trim_func: fn(&'a str, &'a str) -> (&'a str, u32),
121-
args: &'a [ArrayRef],
122-
) -> Result<ArrayRef> {
66+
fn string_view_trim<'a>(trim_type: TrimType, args: &'a [ArrayRef]) -> Result<ArrayRef> {
12367
let string_view_array = as_string_view_array(&args[0])?;
12468
let mut views_buf = Vec::with_capacity(string_view_array.len());
12569
let mut null_builder = NullBufferBuilder::new(string_view_array.len());
70+
let mut pattern_buf = Vec::new();
12671

12772
match args.len() {
12873
1 => {
@@ -132,7 +77,8 @@ fn string_view_trim<'a>(
13277
trim_and_append_str(
13378
src_str_opt,
13479
Some(" "),
135-
trim_func,
80+
trim_type,
81+
&mut pattern_buf,
13682
&mut views_buf,
13783
&mut null_builder,
13884
raw_view,
@@ -159,7 +105,8 @@ fn string_view_trim<'a>(
159105
trim_and_append_str(
160106
src_str_opt,
161107
Some(characters),
162-
trim_func,
108+
trim_type,
109+
&mut pattern_buf,
163110
&mut views_buf,
164111
&mut null_builder,
165112
raw_view,
@@ -176,7 +123,8 @@ fn string_view_trim<'a>(
176123
trim_and_append_str(
177124
src_str_opt,
178125
characters_opt,
179-
trim_func,
126+
trim_type,
127+
&mut pattern_buf,
180128
&mut views_buf,
181129
&mut null_builder,
182130
raw_view,
@@ -211,26 +159,29 @@ fn string_view_trim<'a>(
211159
/// Trims the given string and appends the trimmed string to the views buffer
212160
/// and the null buffer.
213161
///
214-
/// Calls `trim_func` on the string value in `original_view`, for non_null
162+
/// Calls the trim function on the string value in `original_view`, for non_null
215163
/// values and appends the updated view to the views buffer / null_builder.
216164
///
217165
/// Arguments
218166
/// - `src_str_opt`: The original string value (represented by the view)
219167
/// - `trim_characters_opt`: The characters to trim from the string
220-
/// - `trim_func`: The function to apply to the string (see [`string_view_trim`] for details)
168+
/// - `trim_type`: The type of trim to apply (left, right, both)
169+
/// - `pattern_buf`: Reusable buffer for pattern characters
221170
/// - `views_buf`: The buffer to append the updated views to
222171
/// - `null_builder`: The buffer to append the null values to
223172
/// - `original_view`: The original view value (that contains src_str_opt)
224173
fn trim_and_append_str<'a>(
225174
src_str_opt: Option<&'a str>,
226175
trim_characters_opt: Option<&'a str>,
227-
trim_func: fn(&'a str, &'a str) -> (&'a str, u32),
176+
trim_type: TrimType,
177+
pattern_buf: &mut Vec<char>,
228178
views_buf: &mut Vec<u128>,
229179
null_builder: &mut NullBufferBuilder,
230180
original_view: &u128,
231181
) {
232182
if let (Some(src_str), Some(characters)) = (src_str_opt, trim_characters_opt) {
233-
let (trim_str, start_offset) = trim_func(src_str, characters);
183+
let (trim_str, start_offset) =
184+
apply_trim(src_str, characters, trim_type, pattern_buf);
234185
make_and_append_view(
235186
views_buf,
236187
null_builder,
@@ -244,21 +195,59 @@ fn trim_and_append_str<'a>(
244195
}
245196
}
246197

198+
/// Applies the appropriate trim operation based on trim_type
199+
/// Reuses pattern_buf to avoid allocating a Vec for each call
200+
fn apply_trim<'a>(
201+
input: &'a str,
202+
pattern: &str,
203+
trim_type: TrimType,
204+
pattern_buf: &mut Vec<char>,
205+
) -> (&'a str, u32) {
206+
// Reuse the buffer by clearing and refilling it
207+
pattern_buf.clear();
208+
pattern_buf.extend(pattern.chars());
209+
210+
match trim_type {
211+
TrimType::Left => {
212+
let ltrimmed_str =
213+
str::trim_start_matches::<&[char]>(input, pattern_buf.as_ref());
214+
let start_offset = input.len() - ltrimmed_str.len();
215+
(ltrimmed_str, start_offset as u32)
216+
}
217+
TrimType::Right => {
218+
let rtrimmed_str =
219+
str::trim_end_matches::<&[char]>(input, pattern_buf.as_ref());
220+
(rtrimmed_str, 0)
221+
}
222+
TrimType::Both => {
223+
let ltrimmed_str =
224+
str::trim_start_matches::<&[char]>(input, pattern_buf.as_ref());
225+
let start_offset = input.len() - ltrimmed_str.len();
226+
let btrimmed_str =
227+
str::trim_end_matches::<&[char]>(ltrimmed_str, pattern_buf.as_ref());
228+
(btrimmed_str, start_offset as u32)
229+
}
230+
}
231+
}
232+
247233
/// Applies the trim function to the given string array(s)
248234
/// and returns a new string array with the trimmed values.
249-
///
250-
/// See [`string_view_trim`] for details on `func`
251235
fn string_trim<'a, T: OffsetSizeTrait>(
252-
func: fn(&'a str, &'a str) -> (&'a str, u32),
236+
trim_type: TrimType,
253237
args: &'a [ArrayRef],
254238
) -> Result<ArrayRef> {
255239
let string_array = as_generic_string_array::<T>(&args[0])?;
240+
let mut pattern_buf = Vec::new();
256241

257242
match args.len() {
258243
1 => {
259244
let result = string_array
260245
.iter()
261-
.map(|string| string.map(|string: &str| func(string, " ").0))
246+
.map(|string| {
247+
string.map(|string: &str| {
248+
apply_trim(string, " ", trim_type, &mut pattern_buf).0
249+
})
250+
})
262251
.collect::<GenericStringArray<T>>();
263252

264253
Ok(Arc::new(result) as ArrayRef)
@@ -277,7 +266,11 @@ fn string_trim<'a, T: OffsetSizeTrait>(
277266
let characters = characters_array.value(0);
278267
let result = string_array
279268
.iter()
280-
.map(|item| item.map(|string| func(string, characters).0))
269+
.map(|item| {
270+
item.map(|string| {
271+
apply_trim(string, characters, trim_type, &mut pattern_buf).0
272+
})
273+
})
281274
.collect::<GenericStringArray<T>>();
282275
return Ok(Arc::new(result) as ArrayRef);
283276
}
@@ -286,7 +279,9 @@ fn string_trim<'a, T: OffsetSizeTrait>(
286279
.iter()
287280
.zip(characters_array.iter())
288281
.map(|(string, characters)| match (string, characters) {
289-
(Some(string), Some(characters)) => Some(func(string, characters).0),
282+
(Some(string), Some(characters)) => Some(
283+
apply_trim(string, characters, trim_type, &mut pattern_buf).0,
284+
),
290285
_ => None,
291286
})
292287
.collect::<GenericStringArray<T>>();

0 commit comments

Comments
 (0)