Reuse look-behind state between successive iterator searches.

The match iterators of the meta regex and of the PikeVM switch their cache
into "keep look-behind state" mode. While the mode is on, the PikeVM stores a
copy of the look-behind threads whenever it records a match, and the next
search resumes from that copy instead of re-running the look-behind threads
from the beginning of the haystack. The iterators switch the mode off again
(which drops the stored copy) as soon as a search yields no match.

The compiler now registers a look-behind's start state before compiling its
sub-expression, so an outer look-behind is registered before the ones nested
in it. Both PikeVM search routines therefore seed the look-behind threads in
reverse registration order.

diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs
index 8cfdecbec..6bc4bdc71 100644
--- a/regex-automata/src/meta/regex.rs
+++ b/regex-automata/src/meta/regex.rs
@@ -611,7 +611,8 @@ impl Regex {
         &'r self,
         input: I,
     ) -> FindMatches<'r, 'h> {
-        let cache = self.pool.get();
+        let mut cache = self.pool.get();
+        cache.keep_lookaround_state(true);
         let it = iter::Searcher::new(input.into());
         FindMatches { re: self, cache, it }
     }
@@ -652,7 +653,8 @@ impl Regex {
         &'r self,
         input: I,
     ) -> CapturesMatches<'r, 'h> {
-        let cache = self.pool.get();
+        let mut cache = self.pool.get();
+        cache.keep_lookaround_state(true);
         let caps = self.create_captures();
         let it = iter::Searcher::new(input.into());
         CapturesMatches { re: self, cache, caps, it }
@@ -2076,7 +2078,11 @@ impl<'r, 'h> Iterator for FindMatches<'r, 'h> {
     #[inline]
     fn next(&mut self) -> Option<Match> {
         let FindMatches { re, ref mut cache, ref mut it } = *self;
-        it.advance(|input| Ok(re.search_with(cache, input)))
+        let result = it.advance(|input| Ok(re.search_with(cache, input)));
+        if result.is_none() {
+            cache.keep_lookaround_state(false);
+        }
+        result
     }
 
     #[inline]
@@ -2149,6 +2155,7 @@ impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> {
         if caps.is_match() {
             Some(caps.clone())
         } else {
+            cache.keep_lookaround_state(false);
             None
         }
     }
@@ -2385,6 +2392,19 @@ impl Cache {
         re.imp.strat.reset_cache(self)
     }
 
+    /// Set this cache to keep the state of look-behind assertions upon a
+    /// match being found.
+    ///
+    /// This must only be called with a value of `true` when a new search is
+    /// started at the end of a previously found match, otherwise the result
+    /// of any search after this call will most likely be wrong.
+    ///
+    /// Calling this function with a value of `false` will clear any previously
+    /// stored look-behind state.
+    pub fn keep_lookaround_state(&mut self, keep: bool) {
+        self.pikevm.keep_lookaround_state(keep);
+    }
+
     /// Returns the heap memory usage, in bytes, of this cache.
     ///
     /// This does **not** include the stack size used up by this cache. To
diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs
index f7c5c1096..83f5c12ab 100644
--- a/regex-automata/src/meta/wrappers.rs
+++ b/regex-automata/src/meta/wrappers.rs
@@ -133,6 +133,12 @@ impl PikeVMCache {
         PikeVMCache(Some(builder.get().0.create_cache()))
     }
 
+    pub(crate) fn keep_lookaround_state(&mut self, keep: bool) {
+        if let Some(cache) = self.0.as_mut() {
+            cache.keep_lookaround_state(keep);
+        }
+    }
+
     pub(crate) fn reset(&mut self, builder: &PikeVM) {
         self.0.as_mut().unwrap().reset(&builder.get().0);
     }
diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs
index 5a7bccd72..42dd32127 100644
--- a/regex-automata/src/nfa/thompson/compiler.rs
+++ b/regex-automata/src/nfa/thompson/compiler.rs
@@ -1038,12 +1038,15 @@ impl Compiler {
         let unanchored =
             self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
 
+        // Register this look-behind's start state before compiling its
+        // sub-expression, so that an outer look-behind is registered before
+        // the look-behinds nested inside it.
+        self.builder.borrow_mut().start_look_behind(unanchored.start);
         let sub = self.c(lookaround.sub())?;
         let write = self.add_write_lookaround(idx)?;
 
         self.patch(unanchored.end, sub.start)?;
         self.patch(sub.end, write)?;
-        self.builder.borrow_mut().start_look_behind(unanchored.start);
 
         Ok(ThompsonRef { start: check, end: check })
     }
diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs
index b3e6e45c9..813804884 100644
--- a/regex-automata/src/nfa/thompson/pikevm.rs
+++ b/regex-automata/src/nfa/thompson/pikevm.rs
@@ -891,6 +891,7 @@ impl PikeVM {
         cache: &'c mut Cache,
         input: I,
     ) -> FindMatches<'r, 'c, 'h> {
+        cache.keep_lookaround_state(true);
         let caps = Captures::matches(self.get_nfa().group_info().clone());
         let it = iter::Searcher::new(input.into());
         FindMatches { re: self, cache, caps, it }
@@ -934,6 +935,7 @@ impl PikeVM {
         cache: &'c mut Cache,
         input: I,
     ) -> CapturesMatches<'r, 'c, 'h> {
+        cache.keep_lookaround_state(true);
         let caps = self.create_captures();
         let it = iter::Searcher::new(input.into());
         CapturesMatches { re: self, cache, caps, it }
@@ -1265,42 +1267,51 @@ impl PikeVM {
             ref mut lookaround,
             ref mut curr_lookaround,
             ref mut next_lookaround,
+            ref mut match_lookaround,
+            ref keep_lookaround_state,
         } = cache;
 
-        // This initializes the look-behind threads from the start of the input
-        // Note: since capture groups are not allowed inside look-behinds,
-        // there won't be any Capture epsilon transitions and hence it is ok to
-        // use &mut [] for the slots parameter. We need to add the start states
-        // in reverse because nested look-behinds have a higher index but must
-        // be executed first.
-        for look_behind_start in self.nfa.look_behind_starts() {
-            self.epsilon_closure(
-                stack,
-                &mut [],
-                curr_lookaround,
-                lookaround,
-                input,
-                0,
-                *look_behind_start,
-            );
-        }
+        if let Some(active) = match_lookaround {
+            // Resume from the look-behind threads saved at the previous match.
+            *curr_lookaround = active.clone();
+        } else if self.lookaround_count() > 0 {
+            // This initializes the look-behind threads from the start of the input
+            // Note: since capture groups are not allowed inside look-behinds,
+            // there won't be any Capture epsilon transitions and hence it is ok to
+            // use &mut [] for the slots parameter. We need to add the start states
+            // in reverse because more deeply nested look-behinds have a higher index
+            // but must be executed first, so that the result is available for the
+            // outer expression.
+            for look_behind_start in self.nfa.look_behind_starts().iter().rev()
+            {
+                self.epsilon_closure(
+                    stack,
+                    &mut [],
+                    curr_lookaround,
+                    lookaround,
+                    input,
+                    0,
+                    *look_behind_start,
+                );
+            }
 
-        // This brings the look-behind threads into the state they must be for
-        // starting at input.start() instead of the beginning. This is
-        // necessary for look-behinds to be able to match outside of the input
-        // span.
-        for lb_at in 0..input.start() {
-            self.nexts(
-                stack,
-                curr_lookaround,
-                next_lookaround,
-                lookaround,
-                input,
-                lb_at,
-                &mut [],
-            );
-            core::mem::swap(curr_lookaround, next_lookaround);
-            next_lookaround.set.clear();
+            // This brings the look-behind threads into the state they must be for
+            // starting at input.start() instead of the beginning. This is
+            // necessary for look-behinds to be able to match outside of the input
+            // span.
+            for lb_at in 0..input.start() {
+                self.nexts(
+                    stack,
+                    curr_lookaround,
+                    next_lookaround,
+                    lookaround,
+                    input,
+                    lb_at,
+                    &mut [],
+                );
+                core::mem::swap(curr_lookaround, next_lookaround);
+                next_lookaround.set.clear();
+            }
         }
 
         let mut hm = None;
@@ -1428,6 +1439,9 @@ impl PikeVM {
                 self.nexts(stack, curr, next, lookaround, input, at, slots)
             {
                 hm = Some(HalfMatch::new(pid, at));
+                if *keep_lookaround_state {
+                    *match_lookaround = Some(curr_lookaround.clone());
+                }
             }
             // Unless the caller asked us to return early, we need to mush on
             // to see if we can extend our match. (But note that 'nexts' will
@@ -1496,7 +1510,15 @@ impl PikeVM {
             ref mut lookaround,
             ref mut curr_lookaround,
             ref mut next_lookaround,
+            // It makes no sense to keep any look-behind state for this version of
+            // the search, since the caller receives no information about
+            // where the search ended.
+            keep_lookaround_state: _,
+            match_lookaround: _,
         } = cache;
 
-        for look_behind_start in self.nfa.look_behind_starts() {
+        // Seed the look-behind threads in reverse registration order: the
+        // compiler registers an outer look-behind before the ones nested in
+        // it, but the nested ones must run first so their results are ready.
+        for look_behind_start in self.nfa.look_behind_starts().iter().rev() {
             self.epsilon_closure(
@@ -1989,10 +2011,14 @@ impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
             *self;
         // 'advance' converts errors into panics, which is OK here because
         // the PikeVM can never return an error.
-        it.advance(|input| {
+        let result = it.advance(|input| {
             re.search(cache, input, caps);
             Ok(caps.get_match())
-        })
+        });
+        if result.is_none() {
+            cache.keep_lookaround_state(false);
+        }
+        result
     }
 }
 
@@ -2034,6 +2060,7 @@ impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> {
         if caps.is_match() {
             Some(caps.clone())
         } else {
+            cache.keep_lookaround_state(false);
             None
         }
     }
@@ -2070,6 +2097,14 @@ pub struct Cache {
     curr_lookaround: ActiveStates,
     /// The next set of states to be explored for look-behind subexpressions.
     next_lookaround: ActiveStates,
+    /// The set of active threads, belonging to look-behind expressions,
+    /// when a match was found. This is needed to resume a search after a match
+    /// was found (to look for further matches), without having to re-scan the
+    /// beginning of the haystack.
+    match_lookaround: Option<ActiveStates>,
+    /// When true, a match stores the look-behind threads in `match_lookaround`
+    /// so the next search resumes from it instead of rescanning the haystack.
+    keep_lookaround_state: bool,
 }
 
 impl Cache {
@@ -2089,6 +2124,8 @@ impl Cache {
             lookaround: vec![None; re.lookaround_count()],
             curr_lookaround: ActiveStates::new(re),
             next_lookaround: ActiveStates::new(re),
+            match_lookaround: None,
+            keep_lookaround_state: false,
         }
     }
 
@@ -2135,6 +2172,25 @@ impl Cache {
         self.curr_lookaround.reset(re);
         self.next_lookaround.reset(re);
         self.lookaround = vec![None; re.lookaround_count()];
+        self.match_lookaround = None;
+        self.keep_lookaround_state = false;
+    }
+
+    /// Set this cache to store a copy of the active threads belonging
+    /// to look-behind assertions upon a match being found.
+    ///
+    /// This is a performance optimization and must only be called with a
+    /// value of `true` when intending to start a new search at the end of
+    /// a previously found match. Otherwise, the result of look-behind
+    /// sub-expressions will be out of sync with the main regex.
+    ///
+    /// Calling this function with a value of `false` will clear any previously
+    /// stored look-behind state.
+    pub fn keep_lookaround_state(&mut self, keep: bool) {
+        self.keep_lookaround_state = keep;
+        if !keep {
+            self.match_lookaround = None;
+        }
     }
 
     /// Returns the heap memory usage, in bytes, of this cache.
@@ -2143,11 +2199,16 @@ impl Cache {
     /// compute that, use `std::mem::size_of::<Cache>()`.
     pub fn memory_usage(&self) -> usize {
         use core::mem::size_of;
+        let match_lookaround_memory = match &self.match_lookaround {
+            Some(ml) => ml.memory_usage(),
+            None => 0,
+        };
         (self.stack.len() * size_of::<FollowEpsilon>())
             + self.curr.memory_usage()
             + self.next.memory_usage()
             + self.curr_lookaround.memory_usage()
             + self.next_lookaround.memory_usage()
+            + match_lookaround_memory
     }
 
     /// Clears this cache. This should be called at the start of every search