Skip to content

Commit 2419b12

Browse files
Implement matchall performance improvement
1 parent 51dd1a4 commit 2419b12

File tree

1 file changed

+86
-35
lines changed

1 file changed

+86
-35
lines changed

regex-automata/src/nfa/thompson/pikevm.rs

Lines changed: 86 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -891,6 +891,7 @@ impl PikeVM {
891891
cache: &'c mut Cache,
892892
input: I,
893893
) -> FindMatches<'r, 'c, 'h> {
894+
cache.keep_lookaround_state(true);
894895
let caps = Captures::matches(self.get_nfa().group_info().clone());
895896
let it = iter::Searcher::new(input.into());
896897
FindMatches { re: self, cache, caps, it }
@@ -934,6 +935,7 @@ impl PikeVM {
934935
cache: &'c mut Cache,
935936
input: I,
936937
) -> CapturesMatches<'r, 'c, 'h> {
938+
cache.keep_lookaround_state(true);
937939
let caps = self.create_captures();
938940
let it = iter::Searcher::new(input.into());
939941
CapturesMatches { re: self, cache, caps, it }
@@ -1265,42 +1267,48 @@ impl PikeVM {
12651267
ref mut lookaround,
12661268
ref mut curr_lookaround,
12671269
ref mut next_lookaround,
1270+
ref mut match_lookaround,
1271+
ref keep_lookaround_state,
12681272
} = cache;
12691273

1270-
// This initializes the look-behind threads from the start of the input
1271-
// Note: since capture groups are not allowed inside look-behinds,
1272-
// there won't be any Capture epsilon transitions and hence it is ok to
1273-
// use &mut [] for the slots parameter. We need to add the start states
1274-
// in reverse because nested look-behinds have a higher index but must
1275-
// be executed first.
1276-
for look_behind_start in self.nfa.look_behind_starts() {
1277-
self.epsilon_closure(
1278-
stack,
1279-
&mut [],
1280-
curr_lookaround,
1281-
lookaround,
1282-
input,
1283-
0,
1284-
*look_behind_start,
1285-
);
1286-
}
1274+
if let Some(active) = match_lookaround {
1275+
*curr_lookaround = active.clone();
1276+
} else {
1277+
// This initializes the look-behind threads from the start of the input.
1278+
// Note: since capture groups are not allowed inside look-behinds,
1279+
// there won't be any Capture epsilon transitions and hence it is ok to
1280+
// use &mut [] for the slots parameter. We need to add the start states
1281+
// in reverse because nested look-behinds have a higher index but must
1282+
// be executed first.
1283+
for look_behind_start in self.nfa.look_behind_starts() {
1284+
self.epsilon_closure(
1285+
stack,
1286+
&mut [],
1287+
curr_lookaround,
1288+
lookaround,
1289+
input,
1290+
0,
1291+
*look_behind_start,
1292+
);
1293+
}
12871294

1288-
// This brings the look-behind threads into the state they must be for
1289-
// starting at input.start() instead of the beginning. This is
1290-
// necessary for look-behinds to be able to match outside of the input
1291-
// span.
1292-
for lb_at in 0..input.start() {
1293-
self.nexts(
1294-
stack,
1295-
curr_lookaround,
1296-
next_lookaround,
1297-
lookaround,
1298-
input,
1299-
lb_at,
1300-
&mut [],
1301-
);
1302-
core::mem::swap(curr_lookaround, next_lookaround);
1303-
next_lookaround.set.clear();
1295+
// This brings the look-behind threads into the state they must be for
1296+
// starting at input.start() instead of the beginning. This is
1297+
// necessary for look-behinds to be able to match outside of the input
1298+
// span.
1299+
for lb_at in 0..input.start() {
1300+
self.nexts(
1301+
stack,
1302+
curr_lookaround,
1303+
next_lookaround,
1304+
lookaround,
1305+
input,
1306+
lb_at,
1307+
&mut [],
1308+
);
1309+
core::mem::swap(curr_lookaround, next_lookaround);
1310+
next_lookaround.set.clear();
1311+
}
13041312
}
13051313

13061314
let mut hm = None;
@@ -1428,6 +1436,9 @@ impl PikeVM {
14281436
self.nexts(stack, curr, next, lookaround, input, at, slots)
14291437
{
14301438
hm = Some(HalfMatch::new(pid, at));
1439+
if *keep_lookaround_state {
1440+
*match_lookaround = Some(curr_lookaround.clone());
1441+
}
14311442
}
14321443
// Unless the caller asked us to return early, we need to mush on
14331444
// to see if we can extend our match. (But note that 'nexts' will
@@ -1496,6 +1507,10 @@ impl PikeVM {
14961507
ref mut lookaround,
14971508
ref mut curr_lookaround,
14981509
ref mut next_lookaround,
1510+
// It makes no sense to keep any look-behind state for this version of
1511+
// the search, since the caller receives no information about
1512+
// where the search ended.
1513+
..
14991514
} = cache;
15001515

15011516
for look_behind_start in self.nfa.look_behind_starts() {
@@ -1989,10 +2004,14 @@ impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
19892004
*self;
19902005
// 'advance' converts errors into panics, which is OK here because
19912006
// the PikeVM can never return an error.
1992-
it.advance(|input| {
2007+
let result = it.advance(|input| {
19932008
re.search(cache, input, caps);
19942009
Ok(caps.get_match())
1995-
})
2010+
});
2011+
if result.is_none() {
2012+
cache.keep_lookaround_state(false);
2013+
}
2014+
result
19962015
}
19972016
}
19982017

@@ -2034,6 +2053,7 @@ impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> {
20342053
if caps.is_match() {
20352054
Some(caps.clone())
20362055
} else {
2056+
cache.keep_lookaround_state(false);
20372057
None
20382058
}
20392059
}
@@ -2070,6 +2090,12 @@ pub struct Cache {
20702090
curr_lookaround: ActiveStates,
20712091
/// The next set of states to be explored for look-behind subexpressions.
20722092
next_lookaround: ActiveStates,
2093+
/// The active set of states when a match was found. This is needed
2094+
/// to resume a search without recomputing look-behind subexpressions.
2095+
match_lookaround: Option<ActiveStates>,
2096+
/// When true, use the states of `match_lookaround` to initialize a search,
2097+
/// otherwise recompute from the beginning of the haystack.
2098+
keep_lookaround_state: bool,
20732099
}
20742100

20752101
impl Cache {
@@ -2089,6 +2115,8 @@ impl Cache {
20892115
lookaround: vec![None; re.lookaround_count()],
20902116
curr_lookaround: ActiveStates::new(re),
20912117
next_lookaround: ActiveStates::new(re),
2118+
match_lookaround: None,
2119+
keep_lookaround_state: false,
20922120
}
20932121
}
20942122

@@ -2135,6 +2163,24 @@ impl Cache {
21352163
self.curr_lookaround.reset(re);
21362164
self.next_lookaround.reset(re);
21372165
self.lookaround = vec![None; re.lookaround_count()];
2166+
self.match_lookaround = None;
2167+
self.keep_lookaround_state = false;
2168+
}
2169+
2170+
/// Set this cache to keep the state of look-behind assertions upon a
2171+
/// match being found.
2172+
///
2173+
/// This must only be called with a value of `true` when a new search is
2174+
/// started at the end of a previously found match, otherwise the result
2175+
/// of any search after this call will most likely be wrong.
2176+
///
2177+
/// Calling this function with a value of `false` will clear any previously
2178+
/// stored look-behind state.
2179+
pub fn keep_lookaround_state(&mut self, keep: bool) {
2180+
self.keep_lookaround_state = keep;
2181+
if !keep {
2182+
self.match_lookaround = None;
2183+
}
21382184
}
21392185

21402186
/// Returns the heap memory usage, in bytes, of this cache.
@@ -2143,11 +2189,16 @@ impl Cache {
21432189
/// compute that, use `std::mem::size_of::<Cache>()`.
21442190
pub fn memory_usage(&self) -> usize {
21452191
use core::mem::size_of;
2192+
let match_lookaround_memory = match &self.match_lookaround {
2193+
Some(ml) => ml.memory_usage(),
2194+
None => 0,
2195+
};
21462196
(self.stack.len() * size_of::<FollowEpsilon>())
21472197
+ self.curr.memory_usage()
21482198
+ self.next.memory_usage()
21492199
+ self.curr_lookaround.memory_usage()
21502200
+ self.next_lookaround.memory_usage()
2201+
+ match_lookaround_memory
21512202
}
21522203

21532204
/// Clears this cache. This should be called at the start of every search

0 commit comments

Comments
 (0)