Skip to content

Commit 8f465d6

Browse files
committed
perf: optimize regexp_count to avoid String allocation when start position is provided
Replace `.chars().skip().collect::<String>()` with zero-copy string slicing: use `char_indices()` to find the byte offset of the start position, then slice with `&value[byte_offset..]`. This eliminates an unnecessary String allocation per row when a start position is specified.

Changes:
- Use char_indices().nth() to find the byte offset for the start position (1-based)
- Use string slicing &value[byte_offset..] instead of collecting chars
- Added benchmark to measure performance improvements

Optimization:
- Before: Allocated a new String via .collect() for each row with a start position
- After: Uses a zero-copy string slice

Benchmark results:
- size=1024, str_len=32: 96.361 µs -> 41.458 µs (57.0% faster, 2.3x speedup)
- size=1024, str_len=128: 210.16 µs -> 56.064 µs (73.3% faster, 3.7x speedup)
- size=4096, str_len=32: 376.90 µs -> 162.98 µs (56.8% faster, 2.3x speedup)
- size=4096, str_len=128: 855.68 µs -> 263.61 µs (69.2% faster, 3.2x speedup)

The optimization shows greater improvements for longer strings (up to 73% faster): the new code only walks the characters before the start position and then takes a slice without copying, while the previous approach allocated and copied the rest of the string, a cost that grew with string length.
1 parent 7c50448 commit 8f465d6

File tree

3 files changed

+116
-2
lines changed

3 files changed

+116
-2
lines changed

datafusion/functions/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,11 @@ harness = false
270270
name = "ends_with"
271271
required-features = ["string_expressions"]
272272

273+
[[bench]]
274+
harness = false
275+
name = "regexp_count"
276+
required-features = ["regex_expressions"]
277+
273278
[[bench]]
274279
harness = false
275280
name = "translate"
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
extern crate criterion;
2+
3+
use arrow::array::Int64Array;
4+
use arrow::array::OffsetSizeTrait;
5+
use arrow::datatypes::{DataType, Field};
6+
use arrow::util::bench_util::create_string_array_with_len;
7+
use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
8+
use datafusion_common::config::ConfigOptions;
9+
use datafusion_common::{DataFusionError, ScalarValue};
10+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
11+
use datafusion_functions::regex;
12+
use std::hint::black_box;
13+
use std::sync::Arc;
14+
use std::time::Duration;
15+
16+
fn create_args<O: OffsetSizeTrait>(
17+
size: usize,
18+
str_len: usize,
19+
with_start: bool,
20+
) -> Vec<ColumnarValue> {
21+
let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
22+
23+
// Use a simple pattern that matches common characters
24+
let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some("a".to_string())));
25+
26+
if with_start {
27+
// Test with start position (this is where the optimization matters)
28+
let start_array = Arc::new(Int64Array::from(
29+
(0..size).map(|i| (i % 10 + 1) as i64).collect::<Vec<_>>(),
30+
));
31+
vec![
32+
ColumnarValue::Array(string_array),
33+
pattern,
34+
ColumnarValue::Array(start_array),
35+
]
36+
} else {
37+
vec![ColumnarValue::Array(string_array), pattern]
38+
}
39+
}
40+
41+
fn invoke_regexp_count_with_args(
42+
args: Vec<ColumnarValue>,
43+
number_rows: usize,
44+
) -> Result<ColumnarValue, DataFusionError> {
45+
let arg_fields = args
46+
.iter()
47+
.enumerate()
48+
.map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
49+
.collect::<Vec<_>>();
50+
let config_options = Arc::new(ConfigOptions::default());
51+
52+
regex::regexp_count().invoke_with_args(ScalarFunctionArgs {
53+
args,
54+
arg_fields,
55+
number_rows,
56+
return_field: Field::new("f", DataType::Int64, true).into(),
57+
config_options: Arc::clone(&config_options),
58+
})
59+
}
60+
61+
fn criterion_benchmark(c: &mut Criterion) {
62+
for size in [1024, 4096] {
63+
let mut group = c.benchmark_group(format!("regexp_count size={size}"));
64+
group.sampling_mode(SamplingMode::Flat);
65+
group.sample_size(10);
66+
group.measurement_time(Duration::from_secs(10));
67+
68+
// Test without start position (no optimization impact)
69+
for str_len in [32, 128] {
70+
let args = create_args::<i32>(size, str_len, false);
71+
group.bench_function(
72+
format!("regexp_count_no_start [size={size}, str_len={str_len}]"),
73+
|b| {
74+
b.iter(|| {
75+
let args_cloned = args.clone();
76+
black_box(invoke_regexp_count_with_args(args_cloned, size))
77+
})
78+
},
79+
);
80+
}
81+
82+
// Test with start position (optimization should help here)
83+
for str_len in [32, 128] {
84+
let args = create_args::<i32>(size, str_len, true);
85+
group.bench_function(
86+
format!("regexp_count_with_start [size={size}, str_len={str_len}]"),
87+
|b| {
88+
b.iter(|| {
89+
let args_cloned = args.clone();
90+
black_box(invoke_regexp_count_with_args(args_cloned, size))
91+
})
92+
},
93+
);
94+
}
95+
96+
group.finish();
97+
}
98+
}
99+
100+
criterion_group!(benches, criterion_benchmark);
101+
criterion_main!(benches);

datafusion/functions/src/regex/regexpcount.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -569,8 +569,16 @@ fn count_matches(
569569
));
570570
}
571571

572-
let find_slice = value.chars().skip(start as usize - 1).collect::<String>();
573-
let count = pattern.find_iter(find_slice.as_str()).count();
572+
// Find the byte offset for the start position (1-based character index)
573+
let byte_offset = value
574+
.char_indices()
575+
.nth((start as usize).saturating_sub(1))
576+
.map(|(idx, _)| idx)
577+
.unwrap_or(value.len());
578+
579+
// Use string slicing instead of collecting chars into a new String
580+
let find_slice = &value[byte_offset..];
581+
let count = pattern.find_iter(find_slice).count();
574582
Ok(count as i64)
575583
} else {
576584
let count = pattern.find_iter(value).count();

0 commit comments

Comments
 (0)