@@ -22,25 +22,13 @@ use simd_accel::teddy128::{Teddy, is_teddy_128_available};
22
22
/// A prefix extracted from a compiled regular expression.
23
23
///
24
24
/// A regex prefix is a set of literal strings that *must* be matched at the
25
- /// beginning of a regex in order for the entire regex to match.
26
- ///
27
- /// There are a variety of ways to efficiently scan the search text for a
28
- /// prefix. Currently, there are three implemented:
29
- ///
30
- /// 1. The prefix is a single byte. Just use memchr.
31
- /// 2. If the prefix is a set of two or more single byte prefixes, then
32
- /// a single sparse map is created. Checking if there is a match is a lookup
33
- /// in this map for each byte in the search text.
34
- /// 3. In all other cases, build an Aho-Corasick automaton.
35
- ///
36
- /// It's possible that there's room here for other substring algorithms,
37
- /// such as Boyer-Moore for single-set prefixes greater than 1, or Rabin-Karp
38
- /// for small sets of same-length prefixes.
25
+ /// beginning of a regex in order for the entire regex to match. Similarly
26
+ /// for a regex suffix.
39
27
#[ derive( Clone , Debug ) ]
40
28
pub struct LiteralSearcher {
41
29
complete : bool ,
42
- lcp : MemchrSearch ,
43
- lcs : MemchrSearch ,
30
+ lcp : FreqyPacked ,
31
+ lcs : FreqyPacked ,
44
32
matcher : Matcher ,
45
33
}
46
34
@@ -51,12 +39,13 @@ enum Matcher {
51
39
/// A set of four or more single byte literals.
52
40
Bytes ( SingleByteSet ) ,
53
41
/// A single substring, found using memchr and frequency analysis.
54
- SingleMemchr ( MemchrSearch ) ,
42
+ FreqyPacked ( FreqyPacked ) ,
55
43
/// A single substring, found using Boyer-Moore.
56
- SingleBoyerMoore ( BoyerMooreSearch ) ,
44
+ BoyerMoore ( BoyerMooreSearch ) ,
57
45
/// An Aho-Corasick automaton.
58
46
AC ( FullAcAutomaton < syntax:: Lit > ) ,
59
- /// A simd accelerated multiple string matcher.
47
+ /// A simd accelerated multiple string matcher. Used only for a small
48
+ /// number of small literals.
60
49
Teddy128 ( Teddy ) ,
61
50
}
62
51
@@ -82,8 +71,8 @@ impl LiteralSearcher {
82
71
let complete = lits. all_complete ( ) ;
83
72
LiteralSearcher {
84
73
complete : complete,
85
- lcp : MemchrSearch :: new ( lits. longest_common_prefix ( ) . to_vec ( ) ) ,
86
- lcs : MemchrSearch :: new ( lits. longest_common_suffix ( ) . to_vec ( ) ) ,
74
+ lcp : FreqyPacked :: new ( lits. longest_common_prefix ( ) . to_vec ( ) ) ,
75
+ lcs : FreqyPacked :: new ( lits. longest_common_suffix ( ) . to_vec ( ) ) ,
87
76
matcher : matcher,
88
77
}
89
78
}
@@ -105,8 +94,8 @@ impl LiteralSearcher {
105
94
match self . matcher {
106
95
Empty => Some ( ( 0 , 0 ) ) ,
107
96
Bytes ( ref sset) => sset. find ( haystack) . map ( |i| ( i, i + 1 ) ) ,
108
- SingleMemchr ( ref s) => s. find ( haystack) . map ( |i| ( i, i + s. len ( ) ) ) ,
109
- SingleBoyerMoore ( ref s) => s. find ( haystack) . map ( |i| ( i, i + s. len ( ) ) ) ,
97
+ FreqyPacked ( ref s) => s. find ( haystack) . map ( |i| ( i, i + s. len ( ) ) ) ,
98
+ BoyerMoore ( ref s) => s. find ( haystack) . map ( |i| ( i, i + s. len ( ) ) ) ,
110
99
AC ( ref aut) => aut. find ( haystack) . next ( ) . map ( |m| ( m. start , m. end ) ) ,
111
100
Teddy128 ( ref ted) => ted. find ( haystack) . map ( |m| ( m. start , m. end ) ) ,
112
101
}
@@ -143,8 +132,8 @@ impl LiteralSearcher {
143
132
match self . matcher {
144
133
Matcher :: Empty => LiteralIter :: Empty ,
145
134
Matcher :: Bytes ( ref sset) => LiteralIter :: Bytes ( & sset. dense ) ,
146
- Matcher :: SingleMemchr ( ref s) => LiteralIter :: Single ( & s. pat ) ,
147
- Matcher :: SingleBoyerMoore ( ref s) => LiteralIter :: Single ( & s. pattern ) ,
135
+ Matcher :: FreqyPacked ( ref s) => LiteralIter :: Single ( & s. pat ) ,
136
+ Matcher :: BoyerMoore ( ref s) => LiteralIter :: Single ( & s. pattern ) ,
148
137
Matcher :: AC ( ref ac) => LiteralIter :: AC ( ac. patterns ( ) ) ,
149
138
Matcher :: Teddy128 ( ref ted) => {
150
139
LiteralIter :: Teddy128 ( ted. patterns ( ) )
@@ -153,12 +142,12 @@ impl LiteralSearcher {
153
142
}
154
143
155
144
/// Returns a matcher for the longest common prefix of this matcher.
156
- pub fn lcp ( & self ) -> & MemchrSearch {
145
+ pub fn lcp ( & self ) -> & FreqyPacked {
157
146
& self . lcp
158
147
}
159
148
160
149
/// Returns a matcher for the longest common suffix of this matcher.
161
- pub fn lcs ( & self ) -> & MemchrSearch {
150
+ pub fn lcs ( & self ) -> & FreqyPacked {
162
151
& self . lcs
163
152
}
164
153
@@ -173,8 +162,8 @@ impl LiteralSearcher {
173
162
match self . matcher {
174
163
Empty => 0 ,
175
164
Bytes ( ref sset) => sset. dense . len ( ) ,
176
- SingleMemchr ( _) => 1 ,
177
- SingleBoyerMoore ( _) => 1 ,
165
+ FreqyPacked ( _) => 1 ,
166
+ BoyerMoore ( _) => 1 ,
178
167
AC ( ref aut) => aut. len ( ) ,
179
168
Teddy128 ( ref ted) => ted. len ( ) ,
180
169
}
@@ -186,8 +175,8 @@ impl LiteralSearcher {
186
175
match self . matcher {
187
176
Empty => 0 ,
188
177
Bytes ( ref sset) => sset. approximate_size ( ) ,
189
- SingleMemchr ( ref single) => single. approximate_size ( ) ,
190
- SingleBoyerMoore ( ref single) => single. approximate_size ( ) ,
178
+ FreqyPacked ( ref single) => single. approximate_size ( ) ,
179
+ BoyerMoore ( ref single) => single. approximate_size ( ) ,
191
180
AC ( ref aut) => aut. heap_bytes ( ) ,
192
181
Teddy128 ( ref ted) => ted. approximate_size ( ) ,
193
182
}
@@ -224,9 +213,9 @@ impl Matcher {
224
213
if lits. literals ( ) . len ( ) == 1 {
225
214
let lit = lits. literals ( ) [ 0 ] . to_vec ( ) ;
226
215
if BoyerMooreSearch :: should_use ( lit. as_slice ( ) ) {
227
- return Matcher :: SingleBoyerMoore ( BoyerMooreSearch :: new ( lit) ) ;
216
+ return Matcher :: BoyerMoore ( BoyerMooreSearch :: new ( lit) ) ;
228
217
} else {
229
- return Matcher :: SingleMemchr ( MemchrSearch :: new ( lit) ) ;
218
+ return Matcher :: FreqyPacked ( FreqyPacked :: new ( lit) ) ;
230
219
}
231
220
}
232
221
let is_aho_corasick_fast = sset. dense . len ( ) == 1 && sset. all_ascii ;
@@ -387,11 +376,8 @@ impl SingleByteSet {
387
376
}
388
377
}
389
378
390
- /// Provides an implementation of fast subtring search.
391
- ///
392
- /// This particular implementation is a Boyer-Moore variant, based on the
393
- /// "tuned boyer moore" search from (Hume & Sunday, 1991). It has been tweaked
394
- /// slightly to better use memchr.
379
+ /// Provides an implementation of fast substring search using frequency
380
+ /// analysis.
395
381
///
396
382
/// memchr is so fast that we do everything we can to keep the loop in memchr
397
383
/// for as long as possible. The easiest way to do this is to intelligently
@@ -400,10 +386,8 @@ impl SingleByteSet {
400
386
/// haystack is far too expensive, we compute a set of fixed frequencies up
401
387
/// front and hard code them in src/freqs.rs. Frequency analysis is done via
402
388
/// scripts/frequencies.py.
403
- ///
404
- /// TODO(burntsushi): Add some amount of shifting to this.
405
389
#[ derive( Clone , Debug ) ]
406
- pub struct MemchrSearch {
390
+ pub struct FreqyPacked {
407
391
/// The pattern.
408
392
pat : Vec < u8 > ,
409
393
/// The number of Unicode characters in the pattern. This is useful for
@@ -429,10 +413,10 @@ pub struct MemchrSearch {
429
413
rare2i : usize ,
430
414
}
431
415
432
- impl MemchrSearch {
433
- fn new ( pat : Vec < u8 > ) -> MemchrSearch {
416
+ impl FreqyPacked {
417
+ fn new ( pat : Vec < u8 > ) -> FreqyPacked {
434
418
if pat. is_empty ( ) {
435
- return MemchrSearch :: empty ( ) ;
419
+ return FreqyPacked :: empty ( ) ;
436
420
}
437
421
438
422
// Find the rarest two bytes. Try to make them distinct (but it's not
@@ -457,7 +441,7 @@ impl MemchrSearch {
457
441
let rare2i = pat. iter ( ) . rposition ( |& b| b == rare2) . unwrap ( ) ;
458
442
459
443
let char_len = char_len_lossy ( & pat) ;
460
- MemchrSearch {
444
+ FreqyPacked {
461
445
pat : pat,
462
446
char_len : char_len,
463
447
rare1 : rare1,
@@ -467,8 +451,8 @@ impl MemchrSearch {
467
451
}
468
452
}
469
453
470
- fn empty ( ) -> MemchrSearch {
471
- MemchrSearch {
454
+ fn empty ( ) -> FreqyPacked {
455
+ FreqyPacked {
472
456
pat : vec ! [ ] ,
473
457
char_len : 0 ,
474
458
rare1 : 0 ,
@@ -883,7 +867,7 @@ fn freq_rank(b: u8) -> usize {
883
867
884
868
#[ cfg( test) ]
885
869
mod tests {
886
- use super :: { BoyerMooreSearch , MemchrSearch } ;
870
+ use super :: { BoyerMooreSearch , FreqyPacked } ;
887
871
888
872
//
889
873
// Unit Tests
@@ -1014,9 +998,9 @@ mod tests {
1014
998
} ;
1015
999
1016
1000
let bm_searcher = BoyerMooreSearch :: new( needle. clone( ) ) ;
1017
- let memchr_searcher = MemchrSearch :: new( needle) ;
1001
+ let freqy_memchr = FreqyPacked :: new( needle) ;
1018
1002
TestResult :: from_bool(
1019
- bm_searcher. find( haystack) == memchr_searcher . find( haystack) )
1003
+ bm_searcher. find( haystack) == freqy_memchr . find( haystack) )
1020
1004
}
1021
1005
1022
1006
fn qc_bm_finds_trailing_needle(
0 commit comments