Skip to content

Commit d519c74

Browse files
committed
Add: UTF-8 Processing benchmarks
1 parent 789a69d commit d519c74

File tree

3 files changed

+241
-0
lines changed

3 files changed

+241
-0
lines changed

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ bench_find = [
2929
"bstr", # Byteset search
3030
"aho-corasick", # Byteset search
3131
"icu", # Unicode properties for character matching
32+
"simdutf", # SIMD UTF-8 validation and transcoding
3233
]
3334
bench_hash = [
3435
"bit-set", # for collision counting
@@ -103,6 +104,10 @@ optional = true
103104
default-features = false
104105
features = ["compiled_data"]
105106

107+
[dependencies.simdutf]
108+
version = "0.6"
109+
optional = true
110+
106111
[dependencies.regex]
107112
version = "1.12.2"
108113
optional = false

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,23 @@ Current numbers should look like this:
154154
| `re.finditer` | 0.04 GiB/s | 0.19 GiB/s |
155155
| `stringzilla.Str.find_first_of` | __0.11 GiB/s__ | __8.79 GiB/s__ |
156156

157+
## UTF-8 Processing
158+
159+
On AMD Zen5 Turin CPUs, StringZilla provides the following throughput for splitting text around whitespace and newline characters, measured on datasets in 5 vastly different languages.
160+
Chinese and Korean texts, for example, both consist mostly of 3-byte characters, but Korean uses a lot of whitespace characters to separate words, while Chinese doesn't use any.
161+
French and English both use a lot of single-byte whitespace characters, but French uses many accented letters that are 2 bytes long in UTF-8.
162+
163+
| Library | English | Chinese | Arabic | French | Korean |
164+
| ----------------------------------------- | ----------: | ----------: | ----------: | ----------: | ----------: |
165+
| Split around 25 whitespace characters: | | | | | |
166+
| `stringzilla::utf8_whitespace_splits` | 0.82 GiB/s | 2.40 GiB/s | 2.40 GiB/s | 0.92 GiB/s | 1.88 GiB/s |
167+
| `stdlib::split(char::is_whitespace)` | 0.77 GiB/s | 1.87 GiB/s | 1.04 GiB/s | 0.72 GiB/s | 0.98 GiB/s |
168+
| `icu::WhiteSpace` | 0.11 GiB/s | 0.16 GiB/s | 0.15 GiB/s | 0.12 GiB/s | 0.15 GiB/s |
169+
| | | | | | |
170+
| Split around 9 newline combinations: | | | | | |
171+
| `stringzilla::utf8_newline_splits` | 15.45 GiB/s | 16.65 GiB/s | 18.34 GiB/s | 14.52 GiB/s | 16.71 GiB/s |
172+
| `custom::split(is_unicode_newline)` | 1.90 GiB/s | 1.93 GiB/s | 1.82 GiB/s | 1.78 GiB/s | 1.81 GiB/s |
173+
157174
## Sequence Operations
158175

159176
Rust has several Dataframe libraries, DBMS and Search engines that heavily rely on string sorting and intersections.

bench_find.rs

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ use stringtape::BytesCowsAuto;
4141

4242
use aho_corasick::AhoCorasick;
4343
use bstr::ByteSlice;
44+
use icu::properties::props::WhiteSpace;
45+
use icu::properties::CodePointSetData;
4446
use memchr::memmem;
4547
use regex::bytes::Regex;
4648
use stringzilla::sz;
@@ -330,6 +332,203 @@ fn bench_byteset_forward(
330332
}
331333
}
332334

335+
/// Benchmarks Unicode whitespace splitting using ICU, stdlib, and StringZilla.
336+
fn bench_utf8_whitespaces(
337+
g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
338+
haystack: &[u8],
339+
_needles: &BytesCowsAuto,
340+
) {
341+
g.throughput(Throughput::Bytes(haystack.len() as u64));
342+
343+
// Benchmark for StringZilla whitespace splits.
344+
if should_run("utf8-whitespaces/stringzilla::utf8_whitespace_splits().count()") {
345+
use sz::StringZillableUnary;
346+
g.bench_function("stringzilla::utf8_whitespace_splits().count()", |b| {
347+
b.iter(|| {
348+
let haystack_bytes = black_box(haystack);
349+
let count: usize = haystack_bytes.sz_utf8_whitespace_splits().count();
350+
black_box(count);
351+
})
352+
});
353+
}
354+
355+
// Benchmark for Rust stdlib char::is_whitespace.
356+
if should_run("utf8-whitespaces/stdlib::split(char::is_whitespace).count()") {
357+
g.bench_function("stdlib::split(char::is_whitespace).count()", |b| {
358+
b.iter(|| {
359+
let haystack_str = black_box(std::str::from_utf8(haystack).unwrap());
360+
let count: usize = haystack_str
361+
.split(char::is_whitespace)
362+
.filter(|s| !s.is_empty())
363+
.count();
364+
black_box(count);
365+
})
366+
});
367+
}
368+
369+
// Benchmark for ICU4X WhiteSpace property.
370+
if should_run("utf8-whitespaces/icu::WhiteSpace.split().count()") {
371+
let white_space = CodePointSetData::new::<WhiteSpace>();
372+
g.bench_function("icu::WhiteSpace.split().count()", |b| {
373+
b.iter(|| {
374+
let haystack_str = black_box(std::str::from_utf8(haystack).unwrap());
375+
let count: usize = haystack_str
376+
.split(|c: char| white_space.contains(c))
377+
.filter(|s: &&str| !s.is_empty())
378+
.count();
379+
black_box(count);
380+
})
381+
});
382+
}
383+
}
384+
385+
/// Benchmarks Unicode newline splitting using custom predicates and StringZilla.
386+
fn bench_utf8_newlines(
387+
g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
388+
haystack: &[u8],
389+
_needles: &BytesCowsAuto,
390+
) {
391+
g.throughput(Throughput::Bytes(haystack.len() as u64));
392+
393+
// Custom newline predicate matching StringZilla's 7 newline characters.
394+
fn is_unicode_newline(c: char) -> bool {
395+
matches!(
396+
c,
397+
'\n' | '\r' | '\x0B' | '\x0C' | '\u{0085}' | '\u{2028}' | '\u{2029}'
398+
)
399+
}
400+
401+
// Benchmark for StringZilla newline splits.
402+
if should_run("utf8-newlines/stringzilla::utf8_newline_splits().count()") {
403+
use sz::StringZillableUnary;
404+
g.bench_function("stringzilla::utf8_newline_splits().count()", |b| {
405+
b.iter(|| {
406+
let haystack_bytes = black_box(haystack);
407+
let count: usize = haystack_bytes.sz_utf8_newline_splits().count();
408+
black_box(count);
409+
})
410+
});
411+
}
412+
413+
// Benchmark for custom newline predicate.
414+
if should_run("utf8-newlines/custom::split(is_unicode_newline).count()") {
415+
g.bench_function("custom::split(is_unicode_newline).count()", |b| {
416+
b.iter(|| {
417+
let haystack_str = black_box(std::str::from_utf8(haystack).unwrap());
418+
let count: usize = haystack_str
419+
.split(is_unicode_newline)
420+
.filter(|s| !s.is_empty())
421+
.count();
422+
black_box(count);
423+
})
424+
});
425+
}
426+
}
427+
428+
/// Benchmarks UTF-8 character counting using StringZilla, simdutf, and stdlib.
429+
fn bench_utf8_length(
430+
g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
431+
haystack: &[u8],
432+
_needles: &BytesCowsAuto,
433+
) {
434+
g.throughput(Throughput::Bytes(haystack.len() as u64));
435+
436+
// Benchmark for StringZilla UTF-8 character counting.
437+
if should_run("utf8-length/stringzilla::utf8_chars().len()") {
438+
use sz::StringZillableUnary;
439+
g.bench_function("stringzilla::utf8_chars().len()", |b| {
440+
b.iter(|| {
441+
let haystack_bytes = black_box(haystack);
442+
let count: usize = haystack_bytes.sz_utf8_chars().len();
443+
black_box(count);
444+
})
445+
});
446+
}
447+
448+
// Benchmark for simdutf UTF-8 character counting.
449+
if should_run("utf8-length/simdutf::count_utf8()") {
450+
g.bench_function("simdutf::count_utf8()", |b| {
451+
b.iter(|| {
452+
let haystack_bytes = black_box(haystack);
453+
let count: usize = simdutf::count_utf8(haystack_bytes);
454+
black_box(count);
455+
})
456+
});
457+
}
458+
459+
// Benchmark for stdlib UTF-8 character counting.
460+
if should_run("utf8-length/stdlib::chars().count()") {
461+
g.bench_function("stdlib::chars().count()", |b| {
462+
b.iter(|| {
463+
let haystack_str = black_box(std::str::from_utf8(haystack).unwrap());
464+
let count: usize = haystack_str.chars().count();
465+
black_box(count);
466+
})
467+
});
468+
}
469+
}
470+
471+
/// Benchmarks UTF-8 to UTF-32 decoding using StringZilla, simdutf, and stdlib.
472+
fn bench_utf8_iterator(
473+
g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
474+
haystack: &[u8],
475+
_needles: &BytesCowsAuto,
476+
) {
477+
g.throughput(Throughput::Bytes(haystack.len() as u64));
478+
479+
// Benchmark for StringZilla UTF-8 character iteration.
480+
if should_run("utf8-iterator/stringzilla::utf8_chars().iter()") {
481+
use sz::StringZillableUnary;
482+
g.bench_function("stringzilla::utf8_chars().iter()", |b| {
483+
b.iter(|| {
484+
let haystack_bytes = black_box(haystack);
485+
let mut sum: u32 = 0;
486+
for ch in haystack_bytes.sz_utf8_chars().iter() {
487+
sum = sum.wrapping_add(ch as u32);
488+
}
489+
black_box(sum);
490+
})
491+
});
492+
}
493+
494+
// Benchmark for simdutf UTF-8 to UTF-32 conversion.
495+
if should_run("utf8-iterator/simdutf::convert_utf8_to_utf32()") {
496+
// Pre-allocate buffer for UTF-32 output (worst case: same number of codepoints as bytes)
497+
let mut utf32_buffer = vec![0u32; haystack.len()];
498+
g.bench_function("simdutf::convert_utf8_to_utf32()", |b| {
499+
b.iter(|| {
500+
let haystack_bytes = black_box(haystack);
501+
let len = unsafe {
502+
simdutf::convert_utf8_to_utf32(
503+
haystack_bytes.as_ptr(),
504+
haystack_bytes.len(),
505+
utf32_buffer.as_mut_ptr(),
506+
)
507+
};
508+
let mut sum: u32 = 0;
509+
for i in 0..len {
510+
sum = sum.wrapping_add(utf32_buffer[i]);
511+
}
512+
black_box(sum);
513+
})
514+
});
515+
}
516+
517+
// Benchmark for stdlib UTF-8 character iteration.
518+
if should_run("utf8-iterator/stdlib::chars()") {
519+
g.bench_function("stdlib::chars()", |b| {
520+
b.iter(|| {
521+
let haystack_str = black_box(unsafe { std::str::from_utf8_unchecked(haystack) });
522+
let mut sum: u32 = 0;
523+
for ch in haystack_str.chars() {
524+
sum = sum.wrapping_add(ch as u32);
525+
}
526+
black_box(sum);
527+
})
528+
});
529+
}
530+
}
531+
333532
fn main() {
334533
log_stringzilla_metadata();
335534

@@ -360,5 +559,25 @@ fn main() {
360559
bench_byteset_forward(&mut group, &haystack, &needles);
361560
group.finish();
362561

562+
// Benchmarks for Unicode whitespace splitting
563+
let mut group = criterion.benchmark_group("utf8-whitespaces");
564+
bench_utf8_whitespaces(&mut group, &haystack, &needles);
565+
group.finish();
566+
567+
// Benchmarks for Unicode newline splitting
568+
let mut group = criterion.benchmark_group("utf8-newlines");
569+
bench_utf8_newlines(&mut group, &haystack, &needles);
570+
group.finish();
571+
572+
// Benchmarks for UTF-8 character counting
573+
let mut group = criterion.benchmark_group("utf8-length");
574+
bench_utf8_length(&mut group, &haystack, &needles);
575+
group.finish();
576+
577+
// Benchmarks for UTF-8 character iteration
578+
let mut group = criterion.benchmark_group("utf8-iterator");
579+
bench_utf8_iterator(&mut group, &haystack, &needles);
580+
group.finish();
581+
363582
criterion.final_summary();
364583
}

0 commit comments

Comments
 (0)