perf: optimize strpos by eliminating double iteration for UTF-8 (#19572)

viirya · web-flow · commit 195d3d64bb9f · 2026-01-01T21:24:30.000Z
## Which issue does this PR close?

- Closes #.

## Rationale for this change

## What changes are included in this PR?

For non-ASCII strings, the original implementation used `string.find()` to get the byte index, then counted characters up to that byte index. This required two passes through the string.

This optimization uses `char_indices()` to find the substring while simultaneously tracking character positions, completing the search in a single pass.

Benchmark results (UTF-8 strings):

- str_len_8: 188.98 µs → 140.54 µs (25.4% faster)
- str_len_32: 615.69 µs → 294.15 µs (52.2% faster)
- str_len_128: 2.2707 ms → 1.2462 ms (45.1% faster)
- str_len_4096: 74.328 ms → 36.538 ms (50.9% faster)

ASCII performance unchanged (already optimized with fast path).

## Are these changes tested?

## Are there any user-facing changes?
diff --git a/datafusion/functions/src/unicode/strpos.rs b/datafusion/functions/src/unicode/strpos.rs
@@ -215,14 +215,37 @@ where
                         )
                     }
                 } else {
-                    // The `find` method returns the byte index of the substring.
-                    // We count the number of chars up to that byte index.
-                    T::Native::from_usize(
-                        string
-                            .find(substring)
-                            .map(|x| string[..x].chars().count() + 1)
-                            .unwrap_or(0),
-                    )
+                    // For non-ASCII, use a single-pass search that tracks both
+                    // byte position and character position simultaneously
+                    if substring.is_empty() {
+                        return T::Native::from_usize(1);
+                    }
+
+                    let substring_bytes = substring.as_bytes();
+                    let string_bytes = string.as_bytes();
+
+                    if substring_bytes.len() > string_bytes.len() {
+                        return T::Native::from_usize(0);
+                    }
+
+                    // Single pass: find substring while counting characters
+                    let mut char_pos = 0;
+                    for (byte_idx, _) in string.char_indices() {
+                        char_pos += 1;
+                        if byte_idx + substring_bytes.len() <= string_bytes.len() {
+                            // SAFETY: We just checked that byte_idx + substring_bytes.len() <= string_bytes.len()
+                            let slice = unsafe {
+                                string_bytes.get_unchecked(
+                                    byte_idx..byte_idx + substring_bytes.len(),
+                                )
+                            };
+                            if slice == substring_bytes {
+                                return T::Native::from_usize(char_pos);
+                            }
+                        }
+                    }
+
+                    T::Native::from_usize(0)
                 }
             }
             _ => None,