@@ -41,6 +41,8 @@ use stringtape::BytesCowsAuto;
4141
4242use aho_corasick:: AhoCorasick ;
4343use bstr:: ByteSlice ;
44+ use icu:: properties:: props:: WhiteSpace ;
45+ use icu:: properties:: CodePointSetData ;
4446use memchr:: memmem;
4547use regex:: bytes:: Regex ;
4648use stringzilla:: sz;
@@ -330,6 +332,203 @@ fn bench_byteset_forward(
330332 }
331333}
332334
335+ /// Benchmarks Unicode whitespace splitting using ICU, stdlib, and StringZilla.
336+ fn bench_utf8_whitespaces (
337+ g : & mut criterion:: BenchmarkGroup < ' _ , criterion:: measurement:: WallTime > ,
338+ haystack : & [ u8 ] ,
339+ _needles : & BytesCowsAuto ,
340+ ) {
341+ g. throughput ( Throughput :: Bytes ( haystack. len ( ) as u64 ) ) ;
342+
343+ // Benchmark for StringZilla whitespace splits.
344+ if should_run ( "utf8-whitespaces/stringzilla::utf8_whitespace_splits().count()" ) {
345+ use sz:: StringZillableUnary ;
346+ g. bench_function ( "stringzilla::utf8_whitespace_splits().count()" , |b| {
347+ b. iter ( || {
348+ let haystack_bytes = black_box ( haystack) ;
349+ let count: usize = haystack_bytes. sz_utf8_whitespace_splits ( ) . count ( ) ;
350+ black_box ( count) ;
351+ } )
352+ } ) ;
353+ }
354+
355+ // Benchmark for Rust stdlib char::is_whitespace.
356+ if should_run ( "utf8-whitespaces/stdlib::split(char::is_whitespace).count()" ) {
357+ g. bench_function ( "stdlib::split(char::is_whitespace).count()" , |b| {
358+ b. iter ( || {
359+ let haystack_str = black_box ( std:: str:: from_utf8 ( haystack) . unwrap ( ) ) ;
360+ let count: usize = haystack_str
361+ . split ( char:: is_whitespace)
362+ . filter ( |s| !s. is_empty ( ) )
363+ . count ( ) ;
364+ black_box ( count) ;
365+ } )
366+ } ) ;
367+ }
368+
369+ // Benchmark for ICU4X WhiteSpace property.
370+ if should_run ( "utf8-whitespaces/icu::WhiteSpace.split().count()" ) {
371+ let white_space = CodePointSetData :: new :: < WhiteSpace > ( ) ;
372+ g. bench_function ( "icu::WhiteSpace.split().count()" , |b| {
373+ b. iter ( || {
374+ let haystack_str = black_box ( std:: str:: from_utf8 ( haystack) . unwrap ( ) ) ;
375+ let count: usize = haystack_str
376+ . split ( |c : char | white_space. contains ( c) )
377+ . filter ( |s : & & str | !s. is_empty ( ) )
378+ . count ( ) ;
379+ black_box ( count) ;
380+ } )
381+ } ) ;
382+ }
383+ }
384+
385+ /// Benchmarks Unicode newline splitting using custom predicates and StringZilla.
386+ fn bench_utf8_newlines (
387+ g : & mut criterion:: BenchmarkGroup < ' _ , criterion:: measurement:: WallTime > ,
388+ haystack : & [ u8 ] ,
389+ _needles : & BytesCowsAuto ,
390+ ) {
391+ g. throughput ( Throughput :: Bytes ( haystack. len ( ) as u64 ) ) ;
392+
393+ // Custom newline predicate matching StringZilla's 7 newline characters.
394+ fn is_unicode_newline ( c : char ) -> bool {
395+ matches ! (
396+ c,
397+ '\n' | '\r' | '\x0B' | '\x0C' | '\u{0085}' | '\u{2028}' | '\u{2029}'
398+ )
399+ }
400+
401+ // Benchmark for StringZilla newline splits.
402+ if should_run ( "utf8-newlines/stringzilla::utf8_newline_splits().count()" ) {
403+ use sz:: StringZillableUnary ;
404+ g. bench_function ( "stringzilla::utf8_newline_splits().count()" , |b| {
405+ b. iter ( || {
406+ let haystack_bytes = black_box ( haystack) ;
407+ let count: usize = haystack_bytes. sz_utf8_newline_splits ( ) . count ( ) ;
408+ black_box ( count) ;
409+ } )
410+ } ) ;
411+ }
412+
413+ // Benchmark for custom newline predicate.
414+ if should_run ( "utf8-newlines/custom::split(is_unicode_newline).count()" ) {
415+ g. bench_function ( "custom::split(is_unicode_newline).count()" , |b| {
416+ b. iter ( || {
417+ let haystack_str = black_box ( std:: str:: from_utf8 ( haystack) . unwrap ( ) ) ;
418+ let count: usize = haystack_str
419+ . split ( is_unicode_newline)
420+ . filter ( |s| !s. is_empty ( ) )
421+ . count ( ) ;
422+ black_box ( count) ;
423+ } )
424+ } ) ;
425+ }
426+ }
427+
428+ /// Benchmarks UTF-8 character counting using StringZilla, simdutf, and stdlib.
429+ fn bench_utf8_length (
430+ g : & mut criterion:: BenchmarkGroup < ' _ , criterion:: measurement:: WallTime > ,
431+ haystack : & [ u8 ] ,
432+ _needles : & BytesCowsAuto ,
433+ ) {
434+ g. throughput ( Throughput :: Bytes ( haystack. len ( ) as u64 ) ) ;
435+
436+ // Benchmark for StringZilla UTF-8 character counting.
437+ if should_run ( "utf8-length/stringzilla::utf8_chars().len()" ) {
438+ use sz:: StringZillableUnary ;
439+ g. bench_function ( "stringzilla::utf8_chars().len()" , |b| {
440+ b. iter ( || {
441+ let haystack_bytes = black_box ( haystack) ;
442+ let count: usize = haystack_bytes. sz_utf8_chars ( ) . len ( ) ;
443+ black_box ( count) ;
444+ } )
445+ } ) ;
446+ }
447+
448+ // Benchmark for simdutf UTF-8 character counting.
449+ if should_run ( "utf8-length/simdutf::count_utf8()" ) {
450+ g. bench_function ( "simdutf::count_utf8()" , |b| {
451+ b. iter ( || {
452+ let haystack_bytes = black_box ( haystack) ;
453+ let count: usize = simdutf:: count_utf8 ( haystack_bytes) ;
454+ black_box ( count) ;
455+ } )
456+ } ) ;
457+ }
458+
459+ // Benchmark for stdlib UTF-8 character counting.
460+ if should_run ( "utf8-length/stdlib::chars().count()" ) {
461+ g. bench_function ( "stdlib::chars().count()" , |b| {
462+ b. iter ( || {
463+ let haystack_str = black_box ( std:: str:: from_utf8 ( haystack) . unwrap ( ) ) ;
464+ let count: usize = haystack_str. chars ( ) . count ( ) ;
465+ black_box ( count) ;
466+ } )
467+ } ) ;
468+ }
469+ }
470+
471+ /// Benchmarks UTF-8 to UTF-32 decoding using StringZilla, simdutf, and stdlib.
472+ fn bench_utf8_iterator (
473+ g : & mut criterion:: BenchmarkGroup < ' _ , criterion:: measurement:: WallTime > ,
474+ haystack : & [ u8 ] ,
475+ _needles : & BytesCowsAuto ,
476+ ) {
477+ g. throughput ( Throughput :: Bytes ( haystack. len ( ) as u64 ) ) ;
478+
479+ // Benchmark for StringZilla UTF-8 character iteration.
480+ if should_run ( "utf8-iterator/stringzilla::utf8_chars().iter()" ) {
481+ use sz:: StringZillableUnary ;
482+ g. bench_function ( "stringzilla::utf8_chars().iter()" , |b| {
483+ b. iter ( || {
484+ let haystack_bytes = black_box ( haystack) ;
485+ let mut sum: u32 = 0 ;
486+ for ch in haystack_bytes. sz_utf8_chars ( ) . iter ( ) {
487+ sum = sum. wrapping_add ( ch as u32 ) ;
488+ }
489+ black_box ( sum) ;
490+ } )
491+ } ) ;
492+ }
493+
494+ // Benchmark for simdutf UTF-8 to UTF-32 conversion.
495+ if should_run ( "utf8-iterator/simdutf::convert_utf8_to_utf32()" ) {
496+ // Pre-allocate buffer for UTF-32 output (worst case: same number of codepoints as bytes)
497+ let mut utf32_buffer = vec ! [ 0u32 ; haystack. len( ) ] ;
498+ g. bench_function ( "simdutf::convert_utf8_to_utf32()" , |b| {
499+ b. iter ( || {
500+ let haystack_bytes = black_box ( haystack) ;
501+ let len = unsafe {
502+ simdutf:: convert_utf8_to_utf32 (
503+ haystack_bytes. as_ptr ( ) ,
504+ haystack_bytes. len ( ) ,
505+ utf32_buffer. as_mut_ptr ( ) ,
506+ )
507+ } ;
508+ let mut sum: u32 = 0 ;
509+ for i in 0 ..len {
510+ sum = sum. wrapping_add ( utf32_buffer[ i] ) ;
511+ }
512+ black_box ( sum) ;
513+ } )
514+ } ) ;
515+ }
516+
517+ // Benchmark for stdlib UTF-8 character iteration.
518+ if should_run ( "utf8-iterator/stdlib::chars()" ) {
519+ g. bench_function ( "stdlib::chars()" , |b| {
520+ b. iter ( || {
521+ let haystack_str = black_box ( unsafe { std:: str:: from_utf8_unchecked ( haystack) } ) ;
522+ let mut sum: u32 = 0 ;
523+ for ch in haystack_str. chars ( ) {
524+ sum = sum. wrapping_add ( ch as u32 ) ;
525+ }
526+ black_box ( sum) ;
527+ } )
528+ } ) ;
529+ }
530+ }
531+
333532fn main ( ) {
334533 log_stringzilla_metadata ( ) ;
335534
@@ -360,5 +559,25 @@ fn main() {
360559 bench_byteset_forward ( & mut group, & haystack, & needles) ;
361560 group. finish ( ) ;
362561
562+ // Benchmarks for Unicode whitespace splitting
563+ let mut group = criterion. benchmark_group ( "utf8-whitespaces" ) ;
564+ bench_utf8_whitespaces ( & mut group, & haystack, & needles) ;
565+ group. finish ( ) ;
566+
567+ // Benchmarks for Unicode newline splitting
568+ let mut group = criterion. benchmark_group ( "utf8-newlines" ) ;
569+ bench_utf8_newlines ( & mut group, & haystack, & needles) ;
570+ group. finish ( ) ;
571+
572+ // Benchmarks for UTF-8 character counting
573+ let mut group = criterion. benchmark_group ( "utf8-length" ) ;
574+ bench_utf8_length ( & mut group, & haystack, & needles) ;
575+ group. finish ( ) ;
576+
577+ // Benchmarks for UTF-8 character iteration
578+ let mut group = criterion. benchmark_group ( "utf8-iterator" ) ;
579+ bench_utf8_iterator ( & mut group, & haystack, & needles) ;
580+ group. finish ( ) ;
581+
363582 criterion. final_summary ( ) ;
364583}
0 commit comments