diff --git a/Cargo.toml b/Cargo.toml
index 404f053..1aac6ea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
 [dev-dependencies]
 quickcheck = "0.7"
 criterion = "0.5"
+proptest = "1.7.0"
 
 [[bench]]
 name = "chars"
@@ -36,3 +37,8 @@ harness = false
 [[bench]]
 name = "word_bounds"
 harness = false
+
+[[bench]]
+name = "unicode_word_indices"
+harness = false
+
diff --git a/benches/chars.rs b/benches/chars.rs
index bacffa1..2654a26 100644
--- a/benches/chars.rs
+++ b/benches/chars.rs
@@ -41,7 +41,7 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("grapheme", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| grapheme(content)),
         );
     }
@@ -49,7 +49,7 @@
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("scalar", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| scalar(content)),
         );
     }
diff --git a/benches/texts/log.txt b/benches/texts/log.txt
new file mode 100644
index 0000000..e18ca32
--- /dev/null
+++ b/benches/texts/log.txt
@@ -0,0 +1 @@
+2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later
diff --git a/benches/unicode_word_indices.rs b/benches/unicode_word_indices.rs
new file mode 100644
index 0000000..4c09404
--- /dev/null
+++ b/benches/unicode_word_indices.rs
@@ -0,0 +1,37 @@
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+
+use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
+
+const FILES: &[&str] = &[
+    "log", //"arabic",
+    "english",
+    //"hindi",
+    "japanese",
+    //"korean",
+    //"mandarin",
+    //"russian",
+    //"source_code",
+];
+
+#[inline(always)]
+fn grapheme(text: &str) {
+    for w in text.unicode_word_indices() {
+        black_box(w);
+    }
+}
+
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("unicode_word_indices");
+
+    for file in FILES {
+        let input = fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap();
+        group.throughput(criterion::Throughput::Bytes(input.len() as u64));
+        group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| {
+            b.iter(|| grapheme(content))
+        });
+    }
+}
+
+criterion_group!(benches, bench_all);
+criterion_main!(benches);
diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs
index 42d50ff..f1af7c4 100644
--- a/benches/word_bounds.rs
+++ b/benches/word_bounds.rs
@@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("grapheme", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| grapheme(content)),
         );
     }
diff --git a/benches/words.rs b/benches/words.rs
index 86785d5..508bc9f 100644
--- a/benches/words.rs
+++ b/benches/words.rs
@@ -41,7 +41,7 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("grapheme", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| grapheme(content)),
         );
     }
@@ -49,7 +49,7 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("scalar", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| scalar(content)),
         );
     }
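The new `unicode_word_indices` benchmark above measures words-with-offsets iteration over each sample text, with the ASCII-only `log.txt` line added specifically to exercise the fast path introduced in src/word.rs below. As a rough sketch of what the benchmarked call produces (a hypothetical snippet, not part of the patch; the log line is abbreviated here):

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    // Abbreviated version of the log line added in benches/texts/log.txt.
    let line = "2018-07-12 13:59:01 UTC | ERROR | retrying later";
    for (offset, word) in line.unicode_word_indices() {
        // Yields pairs like (0, "2018"), (5, "07"), (8, "12"), ...;
        // segments without any alphanumerics are filtered out.
        println!("{offset}: {word}");
    }
}
```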
diff --git a/src/lib.rs b/src/lib.rs
index c8ec5b5..d15ac0b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -56,11 +56,16 @@
 )]
 #![no_std]
 
+#[cfg(test)]
+extern crate std;
+
 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
 pub use grapheme::{GraphemeIndices, Graphemes};
 pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
 pub use tables::UNICODE_VERSION;
-pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
+pub use word::{UWordBoundIndices, UWordBounds};
+
+use crate::word::{UnicodeWordIndices, UnicodeWords};
 
 mod grapheme;
 mod sentence;
@@ -248,7 +253,7 @@ pub trait UnicodeSegmentation {
 
 impl UnicodeSegmentation for str {
     #[inline]
-    fn graphemes(&self, is_extended: bool) -> Graphemes {
+    fn graphemes(&self, is_extended: bool) -> Graphemes<'_> {
         grapheme::new_graphemes(self, is_extended)
     }
 
@@ -258,32 +263,32 @@ impl UnicodeSegmentation for str {
     }
 
     #[inline]
-    fn unicode_words(&self) -> UnicodeWords {
+    fn unicode_words(&self) -> UnicodeWords<'_> {
         word::new_unicode_words(self)
     }
 
     #[inline]
-    fn unicode_word_indices(&self) -> UnicodeWordIndices {
+    fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> {
         word::new_unicode_word_indices(self)
     }
 
     #[inline]
-    fn split_word_bounds(&self) -> UWordBounds {
+    fn split_word_bounds(&self) -> UWordBounds<'_> {
         word::new_word_bounds(self)
     }
 
     #[inline]
-    fn split_word_bound_indices(&self) -> UWordBoundIndices {
+    fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> {
         word::new_word_bound_indices(self)
     }
 
     #[inline]
-    fn unicode_sentences(&self) -> UnicodeSentences {
+    fn unicode_sentences(&self) -> UnicodeSentences<'_> {
         sentence::new_unicode_sentences(self)
     }
 
     #[inline]
-    fn split_sentence_bounds(&self) -> USentenceBounds {
+    fn split_sentence_bounds(&self) -> USentenceBounds<'_> {
         sentence::new_sentence_bounds(self)
     }
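The src/word.rs changes below replace the concrete `Filter<...>` fields with two-variant enums (`WordsIter`, `IndicesIter`) so each call can pick between an ASCII fast path and the general UAX #29 path at runtime. A minimal standalone sketch of that either-iterator pattern (`Either`, `Fast`, and `General` are illustrative names, not from the patch):

```rust
// Sketch: one public type dispatching between two iterator
// implementations, the same shape as WordsIter/IndicesIter below.
enum Either<A, B> {
    Fast(A),
    General(B),
}

impl<A, B, T> Iterator for Either<A, B>
where
    A: Iterator<Item = T>,
    B: Iterator<Item = T>,
{
    type Item = T;

    fn next(&mut self) -> Option<T> {
        // A plain match per call: static dispatch on each branch,
        // no trait object or virtual call involved.
        match self {
            Either::Fast(a) => a.next(),
            Either::General(b) => b.next(),
        }
    }
}

fn main() {
    let it: Either<std::ops::Range<u32>, std::vec::IntoIter<u32>> = Either::Fast(0..3);
    assert_eq!(it.collect::<Vec<_>>(), [0, 1, 2]);
}
```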
diff --git a/src/word.rs b/src/word.rs
index b2a85ae..1a46b39 100644
--- a/src/word.rs
+++ b/src/word.rs
@@ -27,26 +27,33 @@ use crate::tables::word::WordCat;
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 #[derive(Debug)]
 pub struct UnicodeWords<'a> {
-    inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
+    inner: WordsIter<'a>,
 }
 
 impl<'a> Iterator for UnicodeWords<'a> {
     type Item = &'a str;
 
-    #[inline]
-    fn next(&mut self) -> Option<&'a str> {
-        self.inner.next()
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.inner {
+            WordsIter::Ascii(i) => i.next(),
+            WordsIter::Unicode(i) => i.next(),
+        }
     }
 
-    #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
-        self.inner.size_hint()
+        match &self.inner {
+            WordsIter::Ascii(i) => i.size_hint(),
+            WordsIter::Unicode(i) => i.size_hint(),
+        }
     }
 }
 
 impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
     #[inline]
-    fn next_back(&mut self) -> Option<&'a str> {
-        self.inner.next_back()
+    fn next_back(&mut self) -> Option<Self::Item> {
+        match &mut self.inner {
+            WordsIter::Ascii(i) => i.next_back(),
+            WordsIter::Unicode(i) => i.next_back(),
+        }
     }
 }
 
@@ -65,27 +72,33 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 #[derive(Debug)]
 pub struct UnicodeWordIndices<'a> {
-    #[allow(clippy::type_complexity)]
-    inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
+    inner: IndicesIter<'a>,
 }
 
 impl<'a> Iterator for UnicodeWordIndices<'a> {
     type Item = (usize, &'a str);
 
-    #[inline]
-    fn next(&mut self) -> Option<(usize, &'a str)> {
-        self.inner.next()
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.inner {
+            IndicesIter::Ascii(i) => i.next(),
+            IndicesIter::Unicode(i) => i.next(),
+        }
     }
 
-    #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
-        self.inner.size_hint()
+        match &self.inner {
+            IndicesIter::Ascii(i) => i.size_hint(),
+            IndicesIter::Unicode(i) => i.size_hint(),
+        }
     }
 }
 
 impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
     #[inline]
-    fn next_back(&mut self) -> Option<(usize, &'a str)> {
-        self.inner.next_back()
+    fn next_back(&mut self) -> Option<Self::Item> {
+        match &mut self.inner {
+            IndicesIter::Ascii(i) => i.next_back(),
+            IndicesIter::Unicode(i) => i.next_back(),
+        }
     }
 }
 
@@ -717,6 +730,246 @@ impl<'a> UWordBounds<'a> {
     }
 }
 
+/// ASCII fast-path word-boundary iterator for strings that contain only
+/// ASCII characters.
+///
+/// Since we handle only ASCII characters, we can use a much simpler set of
+/// word break values than the full Unicode algorithm:
+/// <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>
+///
+/// | Word_Break value | ASCII code points that belong to it          |
+/// | ---------------- | -------------------------------------------- |
+/// | CR               | U+000D (CR)                                  |
+/// | LF               | U+000A (LF)                                  |
+/// | Newline          | U+000B (VT), U+000C (FF)                     |
+/// | Single_Quote     | U+0027 (')                                   |
+/// | Double_Quote     | U+0022 (")                                   |
+/// | MidNumLet        | U+002E (.) FULL STOP                         |
+/// | MidLetter        | U+003A (:) COLON                             |
+/// | MidNum           | U+002C (,), U+003B (;)                       |
+/// | Numeric          | U+0030..U+0039 (0..9)                        |
+/// | ALetter          | U+0041..U+005A (A..Z), U+0061..U+007A (a..z) |
+/// | ExtendNumLet     | U+005F (_) underscore                        |
+/// | WSegSpace        | U+0020 (SPACE)                               |
+///
+/// The rule macro MidNumLetQ boils down to U+002E (.) FULL STOP and U+0027 (').
+/// AHLetter is the same as ALetter here, so we don't need to distinguish it.
+///
+/// Any other single ASCII byte is its own boundary (the default rule WB999).
+#[derive(Debug)]
+struct AsciiWordBoundIter<'a> {
+    rest: &'a str,
+    offset: usize,
+}
+
+impl<'a> AsciiWordBoundIter<'a> {
+    pub fn new(s: &'a str) -> Self {
+        AsciiWordBoundIter { rest: s, offset: 0 }
+    }
+
+    #[inline]
+    fn is_core(b: u8) -> bool {
+        b.is_ascii_alphanumeric() || b == b'_'
+    }
+
+    #[inline]
+    fn is_infix(b: u8, prev: u8, next: u8) -> bool {
+        match b {
+            // Numeric separators such as "1,000" or "3.14" (WB11/WB12):
+            //
+            // "Numeric (MidNum | MidNumLetQ) Numeric"
+            b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true,
+
+            // Apostrophe, dot, or colon inside an alphabetic word, as in
+            // "can't" or "e.g" (WB6/WB7):
+            //
+            // "AHLetter (MidLetter | MidNumLetQ) AHLetter"
+            // MidLetter = b':'
+            // MidNumLetQ = b'.' | b'\''
+            b'\'' | b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true,
+            _ => false,
+        }
+    }
+}
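A few hypothetical assertions (not part of the patch's tests) showing, through the public API, what the two `is_infix` arms above join and what they split:

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    // WB11/WB12: '.', ',', ';', '\'' between digits keep a number together.
    assert_eq!("1,000.5".unicode_words().collect::<Vec<_>>(), ["1,000.5"]);
    // WB6/WB7: '\'', '.', ':' between letters keep a word together.
    assert_eq!("can't".unicode_words().collect::<Vec<_>>(), ["can't"]);
    // A colon between digits matches neither arm, so the token splits.
    assert_eq!("1:2".unicode_words().collect::<Vec<_>>(), ["1", "2"]);
}
```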
+impl<'a> Iterator for AsciiWordBoundIter<'a> {
+    type Item = (usize, &'a str);
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.rest.is_empty() {
+            return None;
+        }
+
+        let bytes = self.rest.as_bytes();
+        let len = bytes.len();
+
+        // 1) Keep runs of spaces together.
+        //    Spec: WB3d joins adjacent WSegSpace into a single segment.
+        if bytes[0] == b' ' {
+            let mut i = 1;
+            while i < len && bytes[i] == b' ' {
+                i += 1;
+            }
+            let word = &self.rest[..i];
+            let pos = self.offset;
+            self.rest = &self.rest[i..];
+            self.offset += i;
+            return Some((pos, word));
+        }
+
+        // 2) Core run (letters/digits/underscore, plus infix characters).
+        //    Spec: ALetter x ALetter, Numeric x Numeric, etc. (WB5-WB13b)
+        if Self::is_core(bytes[0]) {
+            let mut i = 1;
+            while i < len {
+                let b = bytes[i];
+                if Self::is_core(b)
+                    || (i + 1 < len && Self::is_infix(b, bytes[i - 1], bytes[i + 1]))
+                {
+                    i += 1;
+                } else {
+                    break;
+                }
+            }
+            let word = &self.rest[..i];
+            let pos = self.offset;
+            self.rest = &self.rest[i..];
+            self.offset += i;
+            return Some((pos, word));
+        }
+
+        // 3) Do not break within CRLF.
+        //    Spec: WB3 treats CR+LF as a single non-breaking pair.
+        if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' {
+            let word = &self.rest[..2];
+            let pos = self.offset;
+            self.rest = &self.rest[2..];
+            self.offset += 2;
+            Some((pos, word))
+        } else {
+            // 4) Otherwise, break everywhere.
+            //    Spec: the catch-all rule WB999.
+            let word = &self.rest[..1];
+            let pos = self.offset;
+            self.rest = &self.rest[1..];
+            self.offset += 1;
+            Some((pos, word))
+        }
+    }
+}
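The space-run and CRLF branches above mirror WB3d and WB3. Through the public API, which the proptests below pin the fast path to, that behavior looks like this hypothetical check:

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    // WB3d: a run of spaces is a single segment; WB3: "\r\n" never splits.
    let bounds: Vec<&str> = "a  b\r\n".split_word_bounds().collect();
    assert_eq!(bounds, ["a", "  ", "b", "\r\n"]);
}
```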
+impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
+    fn next_back(&mut self) -> Option<(usize, &'a str)> {
+        let rest = self.rest;
+        if rest.is_empty() {
+            return None;
+        }
+        let bytes = rest.as_bytes();
+        let len = bytes.len();
+
+        // 1) Group runs of spaces.
+        //    Spec: WB3d joins adjacent WSegSpace into a single segment.
+        if bytes[len - 1] == b' ' {
+            // Find the start of this last run of spaces.
+            let mut start = len - 1;
+            while start > 0 && bytes[start - 1] == b' ' {
+                start -= 1;
+            }
+            let word = &rest[start..];
+            let pos = self.offset + start;
+            self.rest = &rest[..start];
+            return Some((pos, word));
+        }
+
+        // 2) Trailing core run (letters/digits/underscore, plus infix characters).
+        //    Spec: ALetter x ALetter, Numeric x Numeric, etc. (WB5-WB13b)
+        if Self::is_core(bytes[len - 1]) {
+            // Scan backwards as long as we see `is_core` or `is_infix` bytes.
+            let mut start = len - 1;
+            while start > 0 {
+                let b = bytes[start - 1];
+                let prev = if start >= 2 { bytes[start - 2] } else { b };
+                let next = bytes[start]; // the byte we just included
+                if Self::is_core(b) || Self::is_infix(b, prev, next) {
+                    start -= 1;
+                } else {
+                    break;
+                }
+            }
+            let word = &rest[start..];
+            let pos = self.offset + start;
+            self.rest = &rest[..start];
+            return Some((pos, word));
+        }
+
+        // 3) Non-core: CR+LF as one token, otherwise a single character.
+        //    Spec: WB3 treats CR+LF as a single non-breaking pair.
+        if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' {
+            let start = len - 2;
+            let word = &rest[start..];
+            let pos = self.offset + start;
+            self.rest = &rest[..start];
+            return Some((pos, word));
+        }
+
+        // 4) Fallback: every other byte is its own segment.
+        //    Spec: the catch-all rule WB999.
+        let start = len - 1;
+        let word = &rest[start..];
+        let pos = self.offset + start;
+        self.rest = &rest[..start];
+        Some((pos, word))
+    }
+}
+
+#[inline]
+fn ascii_word_ok(t: &(usize, &str)) -> bool {
+    has_ascii_alphanumeric(&t.1)
+}
+#[inline]
+fn unicode_word_ok(t: &(usize, &str)) -> bool {
+    has_alphanumeric(&t.1)
+}
+
+type AsciiWordsIter<'a> = Filter<
+    core::iter::Map<AsciiWordBoundIter<'a>, fn((usize, &'a str)) -> &'a str>,
+    fn(&&'a str) -> bool,
+>;
+type UnicodeWordsIter<'a> = Filter<UWordBounds<'a>, fn(&&'a str) -> bool>;
+type AsciiIndicesIter<'a> = Filter<AsciiWordBoundIter<'a>, fn(&(usize, &'a str)) -> bool>;
+type UnicodeIndicesIter<'a> = Filter<UWordBoundIndices<'a>, fn(&(usize, &'a str)) -> bool>;
+
+#[derive(Debug)]
+enum WordsIter<'a> {
+    Ascii(AsciiWordsIter<'a>),
+    Unicode(UnicodeWordsIter<'a>),
+}
+
+#[derive(Debug)]
+enum IndicesIter<'a> {
+    Ascii(AsciiIndicesIter<'a>),
+    Unicode(UnicodeIndicesIter<'a>),
+}
+
+#[inline]
+pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
+    let inner = if s.is_ascii() {
+        WordsIter::Ascii(new_unicode_words_ascii(s))
+    } else {
+        WordsIter::Unicode(new_unicode_words_general(s))
+    };
+    UnicodeWords { inner }
+}
+
+#[inline]
+pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
+    let inner = if s.is_ascii() {
+        IndicesIter::Ascii(new_ascii_word_bound_indices(s).filter(ascii_word_ok))
+    } else {
+        IndicesIter::Unicode(new_word_bound_indices(s).filter(unicode_word_ok))
+    };
+    UnicodeWordIndices { inner }
+}
+
 #[inline]
 pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
     UWordBounds {
@@ -734,6 +987,11 @@ pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> {
     }
 }
 
+#[inline]
+fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> {
+    AsciiWordBoundIter::new(s)
+}
+
 #[inline]
 fn has_alphanumeric(s: &&str) -> bool {
     use crate::tables::util::is_alphanumeric;
@@ -742,27 +1000,38 @@ fn has_alphanumeric(s: &&str) -> bool {
 }
 
 #[inline]
-pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
-    use super::UnicodeSegmentation;
+fn has_ascii_alphanumeric(s: &&str) -> bool {
+    s.chars().any(|c| c.is_ascii_alphanumeric())
+}
 
-    UnicodeWords {
-        inner: s.split_word_bounds().filter(has_alphanumeric),
-    }
+#[inline(always)]
+fn strip_pos((_, w): (usize, &str)) -> &str {
+    w
 }
 
 #[inline]
-pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
-    use super::UnicodeSegmentation;
+fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> {
+    new_ascii_word_bound_indices(s)
+        .map(strip_pos as fn(_) -> _)
+        .filter(has_ascii_alphanumeric)
+}
 
-    UnicodeWordIndices {
-        inner: s
-            .split_word_bound_indices()
-            .filter(|(_, c)| has_alphanumeric(c)),
-    }
+#[inline]
+fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> {
+    new_word_bounds(s).filter(has_alphanumeric)
 }
 
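With the constructors above, the fast/general split is invisible to callers: `is_ascii()` picks the variant once per call, and both variants must produce identical output on ASCII input. A hypothetical illustration:

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    // All-ASCII input: routed to AsciiWordBoundIter via WordsIter::Ascii.
    let fast: Vec<&str> = "The fox can't wait".unicode_words().collect();
    assert_eq!(fast, ["The", "fox", "can't", "wait"]);

    // One non-ASCII character routes the whole string to the general
    // UAX #29 path (WordsIter::Unicode); results agree where both apply.
    let general: Vec<&str> = "Théo can't wait".unicode_words().collect();
    assert_eq!(general, ["Théo", "can't", "wait"]);
}
```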
 #[cfg(test)]
 mod tests {
+    use crate::word::{
+        new_ascii_word_bound_indices, new_unicode_words_ascii, new_word_bound_indices,
+    };
+    use std::string::String;
+    use std::vec::Vec;
+    use std::{format, vec};
+
+    use proptest::prelude::*;
+
     #[test]
     fn test_syriac_abbr_mark() {
         use crate::tables::word as wd;
@@ -776,4 +1045,70 @@ mod tests {
         let (_, _, cat) = wd::word_category('\u{6dd}');
         assert_eq!(cat, wd::WC_Numeric);
     }
+
+    #[test]
+    fn test_ascii_word_bound_indices_various_cases() {
+        let s = "Hello, world!";
+        let words: Vec<(usize, &str)> = new_ascii_word_bound_indices(s).collect();
+        let expected = vec![
+            (0, "Hello"), // simple letters
+            (5, ","),
+            (6, " "),     // space after comma
+            (7, "world"), // letters again, stop at '!'
+            (12, "!"),    // punctuation at the end
+        ];
+        assert_eq!(words, expected);
+    }
+
+    #[test]
+    fn test_ascii_word_indices_various_cases() {
+        let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090";
+        let words: Vec<&str> = new_unicode_words_ascii(s).collect();
+        let expected = vec![
+            "Hello",       // simple letters
+            "world",       // comma and space segments are filtered out
+            "can't",       // apostrophe joins letters
+            "e.g",         // the trailing dot is not infix, so it is dropped
+            "var1",
+            "123,456",     // digits + comma + digits
+            "foo_bar",
+            "example.com",
+            "127.0.0.1",   // a colon between digits does not join
+            "9090",        // port number
+        ];
+        assert_eq!(words, expected);
+    }
+
+    /// Strategy that yields every code point from NUL (0) to DEL (127).
+    fn ascii_char() -> impl Strategy<Value = char> {
+        (0u8..=127).prop_map(|b| b as char)
+    }
+
+    proptest! {
+        #![proptest_config(ProptestConfig::with_cases(10000))]
+        /// The fast path must equal the general path for any ASCII input.
+        #[test]
+        fn proptest_ascii_matches_unicode_word_indices(
+            // Vec<char> -> String, length 0..100
+            s in proptest::collection::vec(ascii_char(), 0..100)
+                .prop_map(|v| v.into_iter().collect::<String>())
+        ) {
+            let fast: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).collect();
+            let uni: Vec<(usize, &str)> = new_word_bound_indices(&s).collect();

+            prop_assert_eq!(fast, uni);
+        }
+
+        /// The fast path must equal the general path for any ASCII input,
+        /// forwards and backwards.
+        #[test]
+        fn proptest_ascii_matches_unicode_word_indices_rev(
+            // Vec<char> -> String, length 0..100
+            s in proptest::collection::vec(ascii_char(), 0..100)
+                .prop_map(|v| v.into_iter().collect::<String>())
+        ) {
+            let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect();
+            let uni_rev: Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect();
+            prop_assert_eq!(fast_rev, uni_rev);
+        }
+    }
 }
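Because `UnicodeWordIndices` also implements `DoubleEndedIterator`, the `_rev` property test above is not redundant: both directions must report the same `(offset, word)` pairs. A small hypothetical check of that contract (the tests and the new benchmark run with the usual `cargo test` and `cargo bench --bench unicode_word_indices`):

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    let s = "ab cd";
    let fwd: Vec<(usize, &str)> = s.unicode_word_indices().collect();
    let rev: Vec<(usize, &str)> = s.unicode_word_indices().rev().collect();
    // Same pairs, opposite order, regardless of which internal
    // iterator (ASCII fast path or general path) was selected.
    assert_eq!(fwd, [(0, "ab"), (3, "cd")]);
    assert_eq!(rev, [(3, "cd"), (0, "ab")]);
}
```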