6 changes: 6 additions & 0 deletions Cargo.toml
@@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
[dev-dependencies]
quickcheck = "0.7"
criterion = "0.5"
proptest = "1.7.0"

[[bench]]
name = "chars"
@@ -36,3 +37,8 @@ harness = false
[[bench]]
name = "word_bounds"
harness = false

[[bench]]
name = "unicode_word_indices"
harness = false
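
The `proptest` dev-dependency is new here; the diff does not show the tests that use it, but a property test over the word-index iterator would look roughly like the sketch below (the property and test name are illustrative, not taken from this PR).

use proptest::prelude::*;
use unicode_segmentation::UnicodeSegmentation;

proptest! {
    // Illustrative property: every (offset, word) pair returned by
    // unicode_word_indices() must point back into the original string.
    #[test]
    fn word_indices_point_into_input(s in "\\PC*") {
        for (offset, word) in s.unicode_word_indices() {
            prop_assert_eq!(&s[offset..offset + word.len()], word);
        }
    }
}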

4 changes: 2 additions & 2 deletions benches/chars.rs
@@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}

for file in FILES {
group.bench_with_input(
BenchmarkId::new("scalar", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| scalar(content)),
);
}
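The edits to the `benches/*.rs` files above and below are purely cosmetic: they switch from positional to inlined format arguments (available since Rust 2021), which render identical strings, so benchmark behaviour is unchanged. A quick check:

fn main() {
    let file = "english";
    // Positional and inlined captures produce the same path string.
    assert_eq!(
        format!("benches/texts/{}.txt", file),
        format!("benches/texts/{file}.txt")
    );
}
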
1 change: 1 addition & 0 deletions benches/texts/log.txt
@@ -0,0 +1 @@
2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later
37 changes: 37 additions & 0 deletions benches/unicode_word_indices.rs
@@ -0,0 +1,37 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

const FILES: &[&str] = &[
"log", //"arabic",
"english",
//"hindi",
"japanese",
//"korean",
//"mandarin",
//"russian",
//"source_code",
];

#[inline(always)]
fn grapheme(text: &str) {
for w in text.unicode_word_indices() {
black_box(w);
}
}

fn bench_all(c: &mut Criterion) {
let mut group = c.benchmark_group("unicode_word_indices");

for file in FILES {
let input = fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap();
group.throughput(criterion::Throughput::Bytes(input.len() as u64));
group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| {
b.iter(|| grapheme(content))
});
}
}

criterion_group!(benches, bench_all);
criterion_main!(benches);
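
For reference, `unicode_word_indices()`, the API the new benchmark exercises, yields `(byte offset, word)` pairs with punctuation and whitespace skipped; a small usage example:

use unicode_segmentation::UnicodeSegmentation;

fn main() {
    let text = "The quick (\"brown\") fox";
    let pairs: Vec<(usize, &str)> = text.unicode_word_indices().collect();
    assert_eq!(
        pairs,
        vec![(0, "The"), (4, "quick"), (12, "brown"), (20, "fox")]
    );
}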
2 changes: 1 addition & 1 deletion benches/word_bounds.rs
@@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}
4 changes: 2 additions & 2 deletions benches/words.rs
@@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}

for file in FILES {
group.bench_with_input(
BenchmarkId::new("scalar", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| scalar(content)),
);
}
25 changes: 14 additions & 11 deletions src/lib.rs
@@ -56,11 +56,14 @@
)]
#![no_std]

#[cfg(test)]
extern crate std;

pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use grapheme::{GraphemeIndices, Graphemes};
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
pub use word::{UWordBoundIndices, UWordBounds};

mod grapheme;
mod sentence;
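
The new `#[cfg(test)] extern crate std;` keeps the crate itself `no_std` while letting the test code link `std`. A minimal sketch of the pattern (the test below is illustrative, not from this crate):

#![no_std]

#[cfg(test)]
extern crate std;

#[cfg(test)]
mod tests {
    use std::vec::Vec;

    #[test]
    fn std_is_available_in_tests() {
        // The library stays no_std, but the test harness links std,
        // so std types such as Vec can be used here.
        let v: Vec<u32> = (0..3).collect();
        assert_eq!(v.len(), 3);
    }
}
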
@@ -133,7 +136,7 @@ pub trait UnicodeSegmentation {
///
/// assert_eq!(&uw1[..], b);
/// ```
fn unicode_words(&self) -> UnicodeWords<'_>;
fn unicode_words(&self) -> impl Iterator<Item = &'_ str>;

/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
@@ -157,7 +160,7 @@
///
/// assert_eq!(&uwi1[..], b);
/// ```
fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>;
fn unicode_word_indices(&self) -> impl Iterator<Item = (usize, &'_ str)>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
@@ -173,7 +176,7 @@
///
/// assert_eq!(&swu1[..], b);
/// ```
fn split_word_bounds(&self) -> UWordBounds<'_>;
fn split_word_bounds(&self) -> impl DoubleEndedIterator<Item = &'_ str>;

/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
/// and their offsets. See `split_word_bounds()` for more information.
@@ -188,7 +191,7 @@
///
/// assert_eq!(&swi1[..], b);
/// ```
fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>;
fn split_word_bound_indices(&self) -> impl DoubleEndedIterator<Item = (usize, &'_ str)>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
@@ -210,7 +213,7 @@
///
/// assert_eq!(&us1[..], b);
/// ```
fn unicode_sentences(&self) -> UnicodeSentences<'_>;
fn unicode_sentences(&self) -> impl Iterator<Item = &'_ str>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
@@ -258,27 +261,27 @@ impl UnicodeSegmentation for str {
}

#[inline]
fn unicode_words(&self) -> UnicodeWords {
fn unicode_words(&self) -> impl Iterator<Item = &'_ str> {
word::new_unicode_words(self)
}

#[inline]
fn unicode_word_indices(&self) -> UnicodeWordIndices {
fn unicode_word_indices(&self) -> impl Iterator<Item = (usize, &'_ str)> {
word::new_unicode_word_indices(self)
}

#[inline]
fn split_word_bounds(&self) -> UWordBounds {
fn split_word_bounds(&self) -> impl DoubleEndedIterator<Item = &'_ str> {
word::new_word_bounds(self)
}

#[inline]
fn split_word_bound_indices(&self) -> UWordBoundIndices {
fn split_word_bound_indices(&self) -> impl DoubleEndedIterator<Item = (usize, &'_ str)> {
word::new_word_bound_indices(self)
}

#[inline]
fn unicode_sentences(&self) -> UnicodeSentences {
fn unicode_sentences(&self) -> impl Iterator<Item = &'_ str> {
sentence::new_unicode_sentences(self)
}

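The substantive `src/lib.rs` change is in the trait's return types: the concrete iterator structs (`UnicodeWords`, `UnicodeWordIndices`, `UWordBounds`, ...) are replaced by `impl Trait` in return position, which trait methods support as of Rust 1.75. A minimal sketch of the pattern, with illustrative names rather than this crate's internals:

// Illustrative trait/impl pair; Segmenter and pieces are not part of
// unicode-segmentation, they only demonstrate impl-Trait-in-trait returns.
trait Segmenter {
    fn pieces(&self) -> impl Iterator<Item = &str>;
}

impl Segmenter for str {
    fn pieces(&self) -> impl Iterator<Item = &str> {
        // Callers only learn "some iterator of &str"; the concrete
        // splitter type stays an implementation detail.
        self.split_whitespace()
    }
}

fn main() {
    let words: Vec<&str> = "a b c".pieces().collect();
    assert_eq!(words, ["a", "b", "c"]);
}

One consequence of this style is that callers can no longer name the returned iterator types, but the crate gains the freedom to change them without a breaking API change.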