Add: UTF-8 Processing benchmarks

ashvardanian · ashvardanian · commit d519c74e265f · 2025-11-23T23:05:56.000Z
diff --git a/Cargo.toml b/Cargo.toml
@@ -29,6 +29,7 @@ bench_find = [
     "bstr",         # Byteset search
     "aho-corasick", # Byteset search
     "icu",          # Unicode properties for character matching
+    "simdutf",      # SIMD UTF-8 validation and transcoding
 ]
 bench_hash = [
     "bit-set",      # for collision counting
@@ -103,6 +104,10 @@ optional = true
 default-features = false
 features = ["compiled_data"]
 
+[dependencies.simdutf]
+version = "0.6"
+optional = true
+
 [dependencies.regex]
 version = "1.12.2"
 optional = false
diff --git a/README.md b/README.md
@@ -154,6 +154,23 @@ Current numbers should look like this:
 | `re.finditer`                   |     0.04 GiB/s |     0.19 GiB/s |
 | `stringzilla.Str.find_first_of` | __0.11 GiB/s__ | __8.79 GiB/s__ |
 
+## UTF-8 Processing
+
+On AMD Zen5 Turin CPUs, StringZilla achieves the following throughput when splitting text around whitespace and newline characters, measured on datasets in 5 vastly different languages.
+Chinese and Korean texts, for example, are both made of mostly 3-byte letters, but Korean uses a lot of whitespace characters for syllable separation, while Chinese doesn't use any.
+French and English both use a lot of single-byte whitespace characters, but French also uses many accented letters that are 2 bytes long in UTF-8.
+
+| Library                                   |     English |     Chinese |      Arabic |      French |      Korean |
+| ----------------------------------------- | ----------: | ----------: | ----------: | ----------: | ----------: |
+| Split around 25 whitespace characters:    |             |             |             |             |             |
+| `stringzilla::utf8_whitespace_splits`     |  0.82 GiB/s |  2.40 GiB/s |  2.40 GiB/s |  0.92 GiB/s |  1.88 GiB/s |
+| `stdlib::split(char::is_whitespace)`      |  0.77 GiB/s |  1.87 GiB/s |  1.04 GiB/s |  0.72 GiB/s |  0.98 GiB/s |
+| `icu::WhiteSpace`                         |  0.11 GiB/s |  0.16 GiB/s |  0.15 GiB/s |  0.12 GiB/s |  0.15 GiB/s |
+|                                           |             |             |             |             |             |
+| Split around 9 newline combinations:      |             |             |             |             |             |
+| `stringzilla::utf8_newline_splits`        | 15.45 GiB/s | 16.65 GiB/s | 18.34 GiB/s | 14.52 GiB/s | 16.71 GiB/s |
+| `custom::split(is_unicode_newline)`       |  1.90 GiB/s |  1.93 GiB/s |  1.82 GiB/s |  1.78 GiB/s |  1.81 GiB/s |
+
 ## Sequence Operations
 
 Rust has several Dataframe libraries, DBMS and Search engines that heavily rely on string sorting and intersections.
diff --git a/bench_find.rs b/bench_find.rs
@@ -41,6 +41,8 @@ use stringtape::BytesCowsAuto;
 
 use aho_corasick::AhoCorasick;
 use bstr::ByteSlice;
+use icu::properties::props::WhiteSpace;
+use icu::properties::CodePointSetData;
 use memchr::memmem;
 use regex::bytes::Regex;
 use stringzilla::sz;
@@ -330,6 +332,203 @@ fn bench_byteset_forward(
     }
 }
 
+/// Benchmarks Unicode whitespace splitting using ICU, stdlib, and StringZilla.
+///
+/// Every variant counts the pieces produced by its splitter, and throughput is
+/// reported in bytes of `haystack` processed. The stdlib and ICU4X variants drop
+/// empty pieces before counting; the StringZilla variant counts whatever its
+/// iterator yields.
+///
+/// `_needles` is not used by this benchmark.
+///
+/// # Panics
+///
+/// Panics if `haystack` is not valid UTF-8 and one of the `&str`-based variants
+/// (stdlib or ICU4X) is enabled.
+fn bench_utf8_whitespaces(
+    g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
+    haystack: &[u8],
+    _needles: &BytesCowsAuto,
+) {
+    g.throughput(Throughput::Bytes(haystack.len() as u64));
+
+    // Benchmark for StringZilla whitespace splits.
+    if should_run("utf8-whitespaces/stringzilla::utf8_whitespace_splits().count()") {
+        use sz::StringZillableUnary;
+        g.bench_function("stringzilla::utf8_whitespace_splits().count()", |b| {
+            b.iter(|| {
+                let haystack_bytes = black_box(haystack);
+                let count: usize = haystack_bytes.sz_utf8_whitespace_splits().count();
+                black_box(count);
+            })
+        });
+    }
+
+    // Benchmark for Rust stdlib char::is_whitespace.
+    if should_run("utf8-whitespaces/stdlib::split(char::is_whitespace).count()") {
+        // Validate once, outside the timed closure: UTF-8 validation is a full
+        // pass over the input and would otherwise be charged to the splitter.
+        let haystack_str = std::str::from_utf8(haystack).expect("haystack must be valid UTF-8");
+        g.bench_function("stdlib::split(char::is_whitespace).count()", |b| {
+            b.iter(|| {
+                let count: usize = black_box(haystack_str)
+                    .split(char::is_whitespace)
+                    .filter(|s| !s.is_empty())
+                    .count();
+                black_box(count);
+            })
+        });
+    }
+
+    // Benchmark for ICU4X WhiteSpace property.
+    if should_run("utf8-whitespaces/icu::WhiteSpace.split().count()") {
+        // The property set is built once and reused by every iteration.
+        let white_space = CodePointSetData::new::<WhiteSpace>();
+        // Same as above: keep UTF-8 validation out of the measured region.
+        let haystack_str = std::str::from_utf8(haystack).expect("haystack must be valid UTF-8");
+        g.bench_function("icu::WhiteSpace.split().count()", |b| {
+            b.iter(|| {
+                let count: usize = black_box(haystack_str)
+                    .split(|c: char| white_space.contains(c))
+                    .filter(|s: &&str| !s.is_empty())
+                    .count();
+                black_box(count);
+            })
+        });
+    }
+}
+
+/// Benchmarks Unicode newline splitting using custom predicates and StringZilla.
+///
+/// Both variants count the pieces produced by their splitter, and throughput is
+/// reported in bytes of `haystack` processed. The predicate-based variant drops
+/// empty pieces before counting; the StringZilla variant counts whatever its
+/// iterator yields.
+///
+/// `_needles` is not used by this benchmark.
+///
+/// # Panics
+///
+/// Panics if `haystack` is not valid UTF-8 and the predicate-based variant is
+/// enabled.
+fn bench_utf8_newlines(
+    g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
+    haystack: &[u8],
+    _needles: &BytesCowsAuto,
+) {
+    g.throughput(Throughput::Bytes(haystack.len() as u64));
+
+    // Custom newline predicate matching StringZilla's 7 newline characters.
+    fn is_unicode_newline(c: char) -> bool {
+        matches!(
+            c,
+            '\n' | '\r' | '\x0B' | '\x0C' | '\u{0085}' | '\u{2028}' | '\u{2029}'
+        )
+    }
+
+    // Benchmark for StringZilla newline splits.
+    if should_run("utf8-newlines/stringzilla::utf8_newline_splits().count()") {
+        use sz::StringZillableUnary;
+        g.bench_function("stringzilla::utf8_newline_splits().count()", |b| {
+            b.iter(|| {
+                let haystack_bytes = black_box(haystack);
+                let count: usize = haystack_bytes.sz_utf8_newline_splits().count();
+                black_box(count);
+            })
+        });
+    }
+
+    // Benchmark for custom newline predicate.
+    if should_run("utf8-newlines/custom::split(is_unicode_newline).count()") {
+        // Validate once, outside the timed closure: UTF-8 validation is a full
+        // pass over the input and would otherwise be charged to the splitter.
+        let haystack_str = std::str::from_utf8(haystack).expect("haystack must be valid UTF-8");
+        g.bench_function("custom::split(is_unicode_newline).count()", |b| {
+            b.iter(|| {
+                let count: usize = black_box(haystack_str)
+                    .split(is_unicode_newline)
+                    .filter(|s| !s.is_empty())
+                    .count();
+                black_box(count);
+            })
+        });
+    }
+}
+
+/// Benchmarks UTF-8 character counting using StringZilla, simdutf, and stdlib.
+///
+/// Throughput is reported in bytes of `haystack` processed. The StringZilla
+/// and simdutf variants are handed the raw byte slice; the stdlib variant
+/// needs a `&str`, which is validated once before timing starts.
+///
+/// `_needles` is not used by this benchmark.
+///
+/// # Panics
+///
+/// Panics if `haystack` is not valid UTF-8 and the stdlib variant is enabled.
+fn bench_utf8_length(
+    g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
+    haystack: &[u8],
+    _needles: &BytesCowsAuto,
+) {
+    g.throughput(Throughput::Bytes(haystack.len() as u64));
+
+    // Benchmark for StringZilla UTF-8 character counting.
+    if should_run("utf8-length/stringzilla::utf8_chars().len()") {
+        use sz::StringZillableUnary;
+        g.bench_function("stringzilla::utf8_chars().len()", |b| {
+            b.iter(|| {
+                let haystack_bytes = black_box(haystack);
+                let count: usize = haystack_bytes.sz_utf8_chars().len();
+                black_box(count);
+            })
+        });
+    }
+
+    // Benchmark for simdutf UTF-8 character counting.
+    if should_run("utf8-length/simdutf::count_utf8()") {
+        g.bench_function("simdutf::count_utf8()", |b| {
+            b.iter(|| {
+                let haystack_bytes = black_box(haystack);
+                let count: usize = simdutf::count_utf8(haystack_bytes);
+                black_box(count);
+            })
+        });
+    }
+
+    // Benchmark for stdlib UTF-8 character counting.
+    if should_run("utf8-length/stdlib::chars().count()") {
+        // Validate once, outside the timed closure, so that only the counting
+        // pass is measured rather than validation plus counting.
+        let haystack_str = std::str::from_utf8(haystack).expect("haystack must be valid UTF-8");
+        g.bench_function("stdlib::chars().count()", |b| {
+            b.iter(|| {
+                let count: usize = black_box(haystack_str).chars().count();
+                black_box(count);
+            })
+        });
+    }
+}
+
+/// Benchmarks UTF-8 to UTF-32 decoding using StringZilla, simdutf, and stdlib.
+///
+/// Each variant decodes `haystack` and folds the decoded values into a wrapping
+/// `u32` sum that is passed to `black_box`, so the decoding cannot be optimized
+/// away. Throughput is reported in bytes of `haystack` processed.
+///
+/// `_needles` is not used by this benchmark.
+///
+/// # Panics
+///
+/// Panics if `haystack` is not valid UTF-8 and the stdlib variant is enabled.
+fn bench_utf8_iterator(
+    g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
+    haystack: &[u8],
+    _needles: &BytesCowsAuto,
+) {
+    g.throughput(Throughput::Bytes(haystack.len() as u64));
+
+    // Benchmark for StringZilla UTF-8 character iteration.
+    if should_run("utf8-iterator/stringzilla::utf8_chars().iter()") {
+        use sz::StringZillableUnary;
+        g.bench_function("stringzilla::utf8_chars().iter()", |b| {
+            b.iter(|| {
+                let haystack_bytes = black_box(haystack);
+                let mut sum: u32 = 0;
+                for ch in haystack_bytes.sz_utf8_chars().iter() {
+                    sum = sum.wrapping_add(ch as u32);
+                }
+                black_box(sum);
+            })
+        });
+    }
+
+    // Benchmark for simdutf UTF-8 to UTF-32 conversion.
+    if should_run("utf8-iterator/simdutf::convert_utf8_to_utf32()") {
+        // Pre-allocate buffer for UTF-32 output (worst case: same number of codepoints as bytes)
+        let mut utf32_buffer = vec![0u32; haystack.len()];
+        g.bench_function("simdutf::convert_utf8_to_utf32()", |b| {
+            b.iter(|| {
+                let haystack_bytes = black_box(haystack);
+                // SAFETY: the source pointer and length describe the live
+                // `haystack` slice, and the destination holds `haystack.len()`
+                // writable `u32`s, which covers the worst case of one code
+                // point per input byte.
+                let len = unsafe {
+                    simdutf::convert_utf8_to_utf32(
+                        haystack_bytes.as_ptr(),
+                        haystack_bytes.len(),
+                        utf32_buffer.as_mut_ptr(),
+                    )
+                };
+                // Only the first `len` entries were produced by this call.
+                let mut sum: u32 = 0;
+                for &code_point in &utf32_buffer[..len] {
+                    sum = sum.wrapping_add(code_point);
+                }
+                black_box(sum);
+            })
+        });
+    }
+
+    // Benchmark for stdlib UTF-8 character iteration.
+    if should_run("utf8-iterator/stdlib::chars()") {
+        // A checked conversion performed once, outside the timed closure, costs
+        // nothing per iteration and avoids the undefined behaviour that
+        // `from_utf8_unchecked` would cause on a dataset that is not valid UTF-8.
+        let haystack_str = std::str::from_utf8(haystack).expect("haystack must be valid UTF-8");
+        g.bench_function("stdlib::chars()", |b| {
+            b.iter(|| {
+                let mut sum: u32 = 0;
+                for ch in black_box(haystack_str).chars() {
+                    sum = sum.wrapping_add(ch as u32);
+                }
+                black_box(sum);
+            })
+        });
+    }
+}
+
 fn main() {
     log_stringzilla_metadata();
 
@@ -360,5 +559,25 @@ fn main() {
     bench_byteset_forward(&mut group, &haystack, &needles);
     group.finish();
 
+    // Benchmarks for Unicode whitespace splitting
+    let mut group = criterion.benchmark_group("utf8-whitespaces");
+    bench_utf8_whitespaces(&mut group, &haystack, &needles);
+    group.finish();
+
+    // Benchmarks for Unicode newline splitting
+    let mut group = criterion.benchmark_group("utf8-newlines");
+    bench_utf8_newlines(&mut group, &haystack, &needles);
+    group.finish();
+
+    // Benchmarks for UTF-8 character counting
+    let mut group = criterion.benchmark_group("utf8-length");
+    bench_utf8_length(&mut group, &haystack, &needles);
+    group.finish();
+
+    // Benchmarks for UTF-8 character iteration
+    let mut group = criterion.benchmark_group("utf8-iterator");
+    bench_utf8_iterator(&mut group, &haystack, &needles);
+    group.finish();
+
     criterion.final_summary();
 }