6 changes: 6 additions & 0 deletions Cargo.toml
@@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
[dev-dependencies]
quickcheck = "0.7"
criterion = "0.5"
proptest = "1.7.0"

[[bench]]
name = "chars"
@@ -36,3 +37,8 @@ harness = false
[[bench]]
name = "word_bounds"
harness = false

[[bench]]
name = "unicode_word_indices"
harness = false
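
The `proptest` dev-dependency is new here; the diff does not show the tests that use it, but a property test over the word-index iterator would look roughly like the sketch below (the property and test name are illustrative, not taken from this PR).

use proptest::prelude::*;
use unicode_segmentation::UnicodeSegmentation;

proptest! {
    // Illustrative property: every (offset, word) pair returned by
    // unicode_word_indices() must point back into the original string.
    #[test]
    fn word_indices_point_into_input(s in "\\PC*") {
        for (offset, word) in s.unicode_word_indices() {
            prop_assert_eq!(&s[offset..offset + word.len()], word);
        }
    }
}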

4 changes: 2 additions & 2 deletions benches/chars.rs
@@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}

for file in FILES {
group.bench_with_input(
BenchmarkId::new("scalar", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| scalar(content)),
);
}
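The edits to the `benches/*.rs` files above and below are purely cosmetic: they switch from positional to inlined format arguments (available since Rust 2021), which render identical strings, so benchmark behaviour is unchanged. A quick check:

fn main() {
    let file = "english";
    // Positional and inlined captures produce the same path string.
    assert_eq!(
        format!("benches/texts/{}.txt", file),
        format!("benches/texts/{file}.txt")
    );
}
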
1 change: 1 addition & 0 deletions benches/texts/log.txt
@@ -0,0 +1 @@
2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later
37 changes: 37 additions & 0 deletions benches/unicode_word_indices.rs
@@ -0,0 +1,37 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

const FILES: &[&str] = &[
"log", //"arabic",
"english",
//"hindi",
"japanese",
//"korean",
//"mandarin",
//"russian",
//"source_code",
];

#[inline(always)]
fn grapheme(text: &str) {
for w in text.unicode_word_indices() {
black_box(w);
}
}

fn bench_all(c: &mut Criterion) {
let mut group = c.benchmark_group("unicode_word_indices");

for file in FILES {
let input = fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap();
group.throughput(criterion::Throughput::Bytes(input.len() as u64));
group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| {
b.iter(|| grapheme(content))
});
}
}

criterion_group!(benches, bench_all);
criterion_main!(benches);
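
For reference, `unicode_word_indices()`, the API the new benchmark exercises, yields `(byte offset, word)` pairs with punctuation and whitespace skipped; a small usage example:

use unicode_segmentation::UnicodeSegmentation;

fn main() {
    let text = "The quick (\"brown\") fox";
    let pairs: Vec<(usize, &str)> = text.unicode_word_indices().collect();
    assert_eq!(
        pairs,
        vec![(0, "The"), (4, "quick"), (12, "brown"), (20, "fox")]
    );
}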
2 changes: 1 addition & 1 deletion benches/word_bounds.rs
@@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}
4 changes: 2 additions & 2 deletions benches/words.rs
@@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
for file in FILES {
group.bench_with_input(
BenchmarkId::new("grapheme", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| grapheme(content)),
);
}

for file in FILES {
group.bench_with_input(
BenchmarkId::new("scalar", file),
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
|b, content| b.iter(|| scalar(content)),
);
}
25 changes: 14 additions & 11 deletions src/lib.rs
@@ -56,11 +56,14 @@
)]
#![no_std]

#[cfg(test)]
extern crate std;

pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use grapheme::{GraphemeIndices, Graphemes};
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
pub use word::{UWordBoundIndices, UWordBounds};

mod grapheme;
mod sentence;
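
The new `#[cfg(test)] extern crate std;` keeps the crate itself `no_std` while letting the test code link `std`. A minimal sketch of the pattern (the test below is illustrative, not from this crate):

#![no_std]

#[cfg(test)]
extern crate std;

#[cfg(test)]
mod tests {
    use std::vec::Vec;

    #[test]
    fn std_is_available_in_tests() {
        // The library stays no_std, but the test harness links std,
        // so std types such as Vec can be used here.
        let v: Vec<u32> = (0..3).collect();
        assert_eq!(v.len(), 3);
    }
}
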
@@ -133,7 +136,7 @@ pub trait UnicodeSegmentation {
///
/// assert_eq!(&uw1[..], b);
/// ```
fn unicode_words(&self) -> UnicodeWords<'_>;
fn unicode_words(&self) -> impl Iterator<Item = &'_ str>;

/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
@@ -157,7 +160,7 @@
///
/// assert_eq!(&uwi1[..], b);
/// ```
fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>;
fn unicode_word_indices(&self) -> impl Iterator<Item = (usize, &'_ str)>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
@@ -173,7 +176,7 @@
///
/// assert_eq!(&swu1[..], b);
/// ```
fn split_word_bounds(&self) -> UWordBounds<'_>;
fn split_word_bounds(&self) -> impl DoubleEndedIterator<Item = &'_ str>;

/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
/// and their offsets. See `split_word_bounds()` for more information.
@@ -188,7 +191,7 @@
///
/// assert_eq!(&swi1[..], b);
/// ```
fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>;
fn split_word_bound_indices(&self) -> impl DoubleEndedIterator<Item = (usize, &'_ str)>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
@@ -210,7 +213,7 @@
///
/// assert_eq!(&us1[..], b);
/// ```
fn unicode_sentences(&self) -> UnicodeSentences<'_>;
fn unicode_sentences(&self) -> impl Iterator<Item = &'_ str>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
@@ -258,27 +261,27 @@ impl UnicodeSegmentation for str {
}

#[inline]
fn unicode_words(&self) -> UnicodeWords {
fn unicode_words(&self) -> impl Iterator<Item = &'_ str> {
word::new_unicode_words(self)
}

#[inline]
fn unicode_word_indices(&self) -> UnicodeWordIndices {
fn unicode_word_indices(&self) -> impl Iterator<Item = (usize, &'_ str)> {
word::new_unicode_word_indices(self)
}

#[inline]
fn split_word_bounds(&self) -> UWordBounds {
fn split_word_bounds(&self) -> impl DoubleEndedIterator<Item = &'_ str> {
word::new_word_bounds(self)
}

#[inline]
fn split_word_bound_indices(&self) -> UWordBoundIndices {
fn split_word_bound_indices(&self) -> impl DoubleEndedIterator<Item = (usize, &'_ str)> {
word::new_word_bound_indices(self)
}

#[inline]
fn unicode_sentences(&self) -> UnicodeSentences {
fn unicode_sentences(&self) -> impl Iterator<Item = &'_ str> {
sentence::new_unicode_sentences(self)
}

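The substantive `src/lib.rs` change is in the trait's return types: the concrete iterator structs (`UnicodeWords`, `UnicodeWordIndices`, `UWordBounds`, ...) are replaced by `impl Trait` in return position, which trait methods support as of Rust 1.75. A minimal sketch of the pattern, with illustrative names rather than this crate's internals:

// Illustrative trait/impl pair; Segmenter and pieces are not part of
// unicode-segmentation, they only demonstrate impl-Trait-in-trait returns.
trait Segmenter {
    fn pieces(&self) -> impl Iterator<Item = &str>;
}

impl Segmenter for str {
    fn pieces(&self) -> impl Iterator<Item = &str> {
        // Callers only learn "some iterator of &str"; the concrete
        // splitter type stays an implementation detail.
        self.split_whitespace()
    }
}

fn main() {
    let words: Vec<&str> = "a b c".pieces().collect();
    assert_eq!(words, ["a", "b", "c"]);
}

One consequence of this style is that callers can no longer name the returned iterator types, but the crate gains the freedom to change them without a breaking API change.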