Skip to content

Commit 11f1815

Browse files
committed
literals: rename SingleMemchr to FreqyPacked
As far as I can tell, nobody has actually described a substring search algorithm that used both frequency analysis and vector instructions. So I'm naming it.
1 parent 7f41cd3 commit 11f1815

File tree

1 file changed

+34
-50
lines changed

1 file changed

+34
-50
lines changed

src/literals.rs

Lines changed: 34 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -22,25 +22,13 @@ use simd_accel::teddy128::{Teddy, is_teddy_128_available};
2222
/// A prefix extracted from a compiled regular expression.
2323
///
2424
/// A regex prefix is a set of literal strings that *must* be matched at the
25-
/// beginning of a regex in order for the entire regex to match.
26-
///
27-
/// There are a variety of ways to efficiently scan the search text for a
28-
/// prefix. Currently, there are three implemented:
29-
///
30-
/// 1. The prefix is a single byte. Just use memchr.
31-
/// 2. If the prefix is a set of two or more single byte prefixes, then
32-
/// a single sparse map is created. Checking if there is a match is a lookup
33-
/// in this map for each byte in the search text.
34-
/// 3. In all other cases, build an Aho-Corasick automaton.
35-
///
36-
/// It's possible that there's room here for other substring algorithms,
37-
/// such as Boyer-Moore for single-set prefixes greater than 1, or Rabin-Karp
38-
/// for small sets of same-length prefixes.
25+
/// beginning of a regex in order for the entire regex to match. Similarly
26+
/// for a regex suffix.
3927
#[derive(Clone, Debug)]
4028
pub struct LiteralSearcher {
4129
complete: bool,
42-
lcp: MemchrSearch,
43-
lcs: MemchrSearch,
30+
lcp: FreqyPacked,
31+
lcs: FreqyPacked,
4432
matcher: Matcher,
4533
}
4634

@@ -51,12 +39,13 @@ enum Matcher {
5139
/// A set of four or more single byte literals.
5240
Bytes(SingleByteSet),
5341
/// A single substring, found using memchr and frequency analysis.
54-
SingleMemchr(MemchrSearch),
42+
FreqyPacked(FreqyPacked),
5543
/// A single substring, found using Boyer-Moore.
56-
SingleBoyerMoore(BoyerMooreSearch),
44+
BoyerMoore(BoyerMooreSearch),
5745
/// An Aho-Corasick automaton.
5846
AC(FullAcAutomaton<syntax::Lit>),
59-
/// A simd accelerated multiple string matcher.
47+
/// A simd accelerated multiple string matcher. Used only for a small
48+
/// number of small literals.
6049
Teddy128(Teddy),
6150
}
6251

@@ -82,8 +71,8 @@ impl LiteralSearcher {
8271
let complete = lits.all_complete();
8372
LiteralSearcher {
8473
complete: complete,
85-
lcp: MemchrSearch::new(lits.longest_common_prefix().to_vec()),
86-
lcs: MemchrSearch::new(lits.longest_common_suffix().to_vec()),
74+
lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()),
75+
lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()),
8776
matcher: matcher,
8877
}
8978
}
@@ -105,8 +94,8 @@ impl LiteralSearcher {
10594
match self.matcher {
10695
Empty => Some((0, 0)),
10796
Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
108-
SingleMemchr(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
109-
SingleBoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
97+
FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
98+
BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
11099
AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)),
111100
Teddy128(ref ted) => ted.find(haystack).map(|m| (m.start, m.end)),
112101
}
@@ -143,8 +132,8 @@ impl LiteralSearcher {
143132
match self.matcher {
144133
Matcher::Empty => LiteralIter::Empty,
145134
Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
146-
Matcher::SingleMemchr(ref s) => LiteralIter::Single(&s.pat),
147-
Matcher::SingleBoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
135+
Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
136+
Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
148137
Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()),
149138
Matcher::Teddy128(ref ted) => {
150139
LiteralIter::Teddy128(ted.patterns())
@@ -153,12 +142,12 @@ impl LiteralSearcher {
153142
}
154143

155144
/// Returns a matcher for the longest common prefix of this matcher.
156-
pub fn lcp(&self) -> &MemchrSearch {
145+
pub fn lcp(&self) -> &FreqyPacked {
157146
&self.lcp
158147
}
159148

160149
/// Returns a matcher for the longest common suffix of this matcher.
161-
pub fn lcs(&self) -> &MemchrSearch {
150+
pub fn lcs(&self) -> &FreqyPacked {
162151
&self.lcs
163152
}
164153

@@ -173,8 +162,8 @@ impl LiteralSearcher {
173162
match self.matcher {
174163
Empty => 0,
175164
Bytes(ref sset) => sset.dense.len(),
176-
SingleMemchr(_) => 1,
177-
SingleBoyerMoore(_) => 1,
165+
FreqyPacked(_) => 1,
166+
BoyerMoore(_) => 1,
178167
AC(ref aut) => aut.len(),
179168
Teddy128(ref ted) => ted.len(),
180169
}
@@ -186,8 +175,8 @@ impl LiteralSearcher {
186175
match self.matcher {
187176
Empty => 0,
188177
Bytes(ref sset) => sset.approximate_size(),
189-
SingleMemchr(ref single) => single.approximate_size(),
190-
SingleBoyerMoore(ref single) => single.approximate_size(),
178+
FreqyPacked(ref single) => single.approximate_size(),
179+
BoyerMoore(ref single) => single.approximate_size(),
191180
AC(ref aut) => aut.heap_bytes(),
192181
Teddy128(ref ted) => ted.approximate_size(),
193182
}
@@ -224,9 +213,9 @@ impl Matcher {
224213
if lits.literals().len() == 1 {
225214
let lit = lits.literals()[0].to_vec();
226215
if BoyerMooreSearch::should_use(lit.as_slice()) {
227-
return Matcher::SingleBoyerMoore(BoyerMooreSearch::new(lit));
216+
return Matcher::BoyerMoore(BoyerMooreSearch::new(lit));
228217
} else {
229-
return Matcher::SingleMemchr(MemchrSearch::new(lit));
218+
return Matcher::FreqyPacked(FreqyPacked::new(lit));
230219
}
231220
}
232221
let is_aho_corasick_fast = sset.dense.len() == 1 && sset.all_ascii;
@@ -387,11 +376,8 @@ impl SingleByteSet {
387376
}
388377
}
389378

390-
/// Provides an implementation of fast substring search.
391-
///
392-
/// This particular implementation is a Boyer-Moore variant, based on the
393-
/// "tuned boyer moore" search from (Hume & Sunday, 1991). It has been tweaked
394-
/// slightly to better use memchr.
379+
/// Provides an implementation of fast substring search using frequency
380+
/// analysis.
395381
///
396382
/// memchr is so fast that we do everything we can to keep the loop in memchr
397383
/// for as long as possible. The easiest way to do this is to intelligently
@@ -400,10 +386,8 @@ impl SingleByteSet {
400386
/// haystack is far too expensive, we compute a set of fixed frequencies up
401387
/// front and hard code them in src/freqs.rs. Frequency analysis is done via
402388
/// scripts/frequencies.py.
403-
///
404-
/// TODO(burntsushi): Add some amount of shifting to this.
405389
#[derive(Clone, Debug)]
406-
pub struct MemchrSearch {
390+
pub struct FreqyPacked {
407391
/// The pattern.
408392
pat: Vec<u8>,
409393
/// The number of Unicode characters in the pattern. This is useful for
@@ -429,10 +413,10 @@ pub struct MemchrSearch {
429413
rare2i: usize,
430414
}
431415

432-
impl MemchrSearch {
433-
fn new(pat: Vec<u8>) -> MemchrSearch {
416+
impl FreqyPacked {
417+
fn new(pat: Vec<u8>) -> FreqyPacked {
434418
if pat.is_empty() {
435-
return MemchrSearch::empty();
419+
return FreqyPacked::empty();
436420
}
437421

438422
// Find the rarest two bytes. Try to make them distinct (but it's not
@@ -457,7 +441,7 @@ impl MemchrSearch {
457441
let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();
458442

459443
let char_len = char_len_lossy(&pat);
460-
MemchrSearch {
444+
FreqyPacked {
461445
pat: pat,
462446
char_len: char_len,
463447
rare1: rare1,
@@ -467,8 +451,8 @@ impl MemchrSearch {
467451
}
468452
}
469453

470-
fn empty() -> MemchrSearch {
471-
MemchrSearch {
454+
fn empty() -> FreqyPacked {
455+
FreqyPacked {
472456
pat: vec![],
473457
char_len: 0,
474458
rare1: 0,
@@ -883,7 +867,7 @@ fn freq_rank(b: u8) -> usize {
883867

884868
#[cfg(test)]
885869
mod tests {
886-
use super::{BoyerMooreSearch, MemchrSearch};
870+
use super::{BoyerMooreSearch, FreqyPacked};
887871

888872
//
889873
// Unit Tests
@@ -1014,9 +998,9 @@ mod tests {
1014998
};
1015999

10161000
let bm_searcher = BoyerMooreSearch::new(needle.clone());
1017-
let memchr_searcher = MemchrSearch::new(needle);
1001+
let freqy_memchr = FreqyPacked::new(needle);
10181002
TestResult::from_bool(
1019-
bm_searcher.find(haystack) == memchr_searcher.find(haystack))
1003+
bm_searcher.find(haystack) == freqy_memchr.find(haystack))
10201004
}
10211005

10221006
fn qc_bm_finds_trailing_needle(

0 commit comments

Comments
 (0)