perf: optimize regexp_count to avoid String allocation when start position is provided

viirya · viirya · commit 8f465d67fc50 · 2025-12-29T10:24:57.000-08:00
Replace `.chars().skip().collect::<String>()` with zero-copy string slicing
using `char_indices()` to find the byte offset, then slice with `&value[byte_offset..]`.

This eliminates unnecessary String allocation per row when a start position
is specified.

Changes:
- Use char_indices().nth() to find byte offset for start position (1-based)
- Use string slicing `&value[byte_offset..]` instead of collecting chars
- Added benchmark to measure performance improvements

Optimization:
- Before: Allocated new String via .collect() for each row with start position
- After: Uses zero-copy string slice

Benchmark results:
- size=1024, str_len=32:  96.361 µs -> 41.458 µs (57.0% faster, 2.3x speedup)
- size=1024, str_len=128: 210.16 µs -> 56.064 µs (73.3% faster, 3.7x speedup)
- size=4096, str_len=32:  376.90 µs -> 162.98 µs (56.8% faster, 2.3x speedup)
- size=4096, str_len=128: 855.68 µs -> 263.61 µs (69.2% faster, 3.2x speedup)

The optimization shows greater improvements for longer strings (up to 73% faster):
once the byte offset is found (a scan over only the first `start - 1` characters),
taking the slice is O(1) regardless of the string's length, while the previous
approach had allocation and copy costs that grew with string length.
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
@@ -270,6 +270,11 @@ harness = false
 name = "ends_with"
 required-features = ["string_expressions"]
 
+[[bench]]
+harness = false
+name = "regexp_count"
+required-features = ["regex_expressions"]
+
 [[bench]]
 harness = false
 name = "translate"
diff --git a/datafusion/functions/benches/regexp_count.rs b/datafusion/functions/benches/regexp_count.rs
@@ -0,0 +1,101 @@
+extern crate criterion;
+
+use arrow::array::Int64Array;
+use arrow::array::OffsetSizeTrait;
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::create_string_array_with_len;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{DataFusionError, ScalarValue};
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::regex;
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+fn create_args<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+    with_start: bool,
+) -> Vec<ColumnarValue> {
+    let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+
+    // Single-literal pattern `a`, passed as a scalar rather than a per-row array
+    let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some("a".to_string())));
+
+    if with_start {
+        // Per-row 1-based start positions cycling through 1..=10 (the path this change targets)
+        let start_array = Arc::new(Int64Array::from(
+            (0..size).map(|i| (i % 10 + 1) as i64).collect::<Vec<_>>(),
+        ));
+        vec![
+            ColumnarValue::Array(string_array),
+            pattern,
+            ColumnarValue::Array(start_array),
+        ]
+    } else {
+        vec![ColumnarValue::Array(string_array), pattern]
+    }
+}
+
+fn invoke_regexp_count_with_args(
+    args: Vec<ColumnarValue>,
+    number_rows: usize,
+) -> Result<ColumnarValue, DataFusionError> {
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    regex::regexp_count().invoke_with_args(ScalarFunctionArgs {
+        args,
+        arg_fields,
+        number_rows,
+        return_field: Field::new("f", DataType::Int64, true).into(),
+        config_options: Arc::clone(&config_options),
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    for size in [1024, 4096] {
+        let mut group = c.benchmark_group(format!("regexp_count size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        // Baseline: two-argument form (values, pattern) with no start position
+        for str_len in [32, 128] {
+            let args = create_args::<i32>(size, str_len, false);
+            group.bench_function(
+                format!("regexp_count_no_start [size={size}, str_len={str_len}]"),
+                |b| {
+                    b.iter(|| {
+                        let args_cloned = args.clone();
+                        black_box(invoke_regexp_count_with_args(args_cloned, size))
+                    })
+                },
+            );
+        }
+
+        // Three-argument form with a per-row start position (the path this change targets)
+        for str_len in [32, 128] {
+            let args = create_args::<i32>(size, str_len, true);
+            group.bench_function(
+                format!("regexp_count_with_start [size={size}, str_len={str_len}]"),
+                |b| {
+                    b.iter(|| {
+                        let args_cloned = args.clone();
+                        black_box(invoke_regexp_count_with_args(args_cloned, size))
+                    })
+                },
+            );
+        }
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs
@@ -569,8 +569,16 @@ fn count_matches(
             ));
         }
 
-        let find_slice = value.chars().skip(start as usize - 1).collect::<String>();
-        let count = pattern.find_iter(find_slice.as_str()).count();
+        // Find the byte offset for the start position (1-based character index)
+        let byte_offset = value
+            .char_indices()
+            .nth((start as usize).saturating_sub(1))
+            .map(|(idx, _)| idx)
+            .unwrap_or(value.len());
+
+        // Use string slicing instead of collecting chars into a new String
+        let find_slice = &value[byte_offset..];
+        let count = pattern.find_iter(find_slice).count();
         Ok(count as i64)
     } else {
         let count = pattern.find_iter(value).count();