diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index c769fda23..4f2f9af79 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -340,6 +340,8 @@ pub struct Builder { /// contains a single regex, then `start_pattern[0]` and `start_anchored` /// are always equivalent. start_pattern: Vec, + /// The starting states for each individual look-behind sub-expression. + start_look_behind: Vec, /// A map from pattern ID to capture group index to name. (If no name /// exists, then a None entry is present. Thus, all capturing groups are /// present in this mapping.) @@ -385,6 +387,7 @@ impl Builder { self.pattern_id = None; self.states.clear(); self.start_pattern.clear(); + self.start_look_behind.clear(); self.captures.clear(); self.memory_states = 0; } @@ -449,6 +452,7 @@ impl Builder { remap.resize(self.states.len(), StateID::ZERO); nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern); + nfa.set_look_behind_starts(self.start_look_behind.as_slice()); nfa.set_captures(&self.captures).map_err(BuildError::captures)?; // The idea here is to convert our intermediate states to their final // form. The only real complexity here is the process of converting @@ -706,6 +710,12 @@ impl Builder { self.start_pattern.len() } + /// Adds the `start_id` to the set of starting states that is used when + /// running look-behind expressions. + pub fn start_look_behind(&mut self, start_id: StateID) { + self.start_look_behind.push(start_id); + } + /// Add an "empty" NFA state. /// /// An "empty" NFA state is a state with a single unconditional epsilon diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 7a9393d1e..5a7bccd72 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -711,11 +711,6 @@ pub struct Compiler { /// State used for caching common suffixes when compiling reverse UTF-8 /// automata (for Unicode character classes). utf8_suffix: RefCell, - /// Top level alternation state which is used to run all look-around - /// assertion checks in lockstep with the main expression. Each look-around - /// expression is compiled to a set of states that is patched into this - /// state, and this state is updated on each new pattern being compiled. - lookaround_alt: RefCell>, /// The next index to use for a look-around expression. lookaround_index: RefCell, } @@ -730,7 +725,6 @@ impl Compiler { utf8_state: RefCell::new(Utf8State::new()), trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), - lookaround_alt: RefCell::new(None), lookaround_index: RefCell::new(SmallIndex::ZERO), } } @@ -993,32 +987,11 @@ impl Compiler { let compiled = self.c_alt_iter(exprs.iter().map(|e| { let _ = self.start_pattern()?; - let has_lookarounds = - (e.borrow() as &Hir).properties().contains_lookaround_expr(); - let mut top_level_alt = if has_lookarounds { - self.add_union()? - } else { - StateID::ZERO - }; - if has_lookarounds { - let lookaround_prefix = - self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - let lookaround_alt = self.add_union()?; - self.patch(lookaround_prefix.end, lookaround_alt)?; - self.patch(top_level_alt, lookaround_prefix.start)?; - self.lookaround_alt.borrow_mut().replace(lookaround_alt); - } let one = self.c_cap(0, None, e.borrow())?; let match_state_id = self.add_match()?; self.patch(one.end, match_state_id)?; - if has_lookarounds { - self.patch(top_level_alt, one.start)?; - } else { - top_level_alt = one.start; - } - let _ = self.finish_pattern(top_level_alt)?; - self.lookaround_alt.borrow_mut().take(); - Ok(ThompsonRef { start: top_level_alt, end: match_state_id }) + let _ = self.finish_pattern(one.start)?; + Ok(ThompsonRef { start: one.start, end: match_state_id }) }))?; self.patch(unanchored_prefix.end, compiled.start)?; let nfa = self @@ -1052,25 +1025,25 @@ impl Compiler { &self, lookaround: &LookAround, ) -> Result { - let sub = self.c(lookaround.sub())?; - let pos = match lookaround { - LookAround::NegativeLookBehind(_) => false, - LookAround::PositiveLookBehind(_) => true, - }; let idx = *self.lookaround_index.borrow(); *self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more()) .map_err(|e| { BuildError::too_many_lookarounds(e.attempted() as usize) })?; + let pos = match lookaround { + LookAround::NegativeLookBehind(_) => false, + LookAround::PositiveLookBehind(_) => true, + }; let check = self.add_check_lookaround(idx, pos)?; + + let unanchored = + self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; + + let sub = self.c(lookaround.sub())?; let write = self.add_write_lookaround(idx)?; + self.patch(unanchored.end, sub.start)?; self.patch(sub.end, write)?; - self.patch( - self.lookaround_alt - .borrow() - .expect("Cannot compile look-around outside pattern"), - sub.start, - )?; + self.builder.borrow_mut().start_look_behind(unanchored.start); Ok(ThompsonRef { start: check, end: check }) } @@ -2169,13 +2142,12 @@ mod tests { &[ s_bin_union(2, 1), s_range(0, 255, 0), - s_bin_union(3, 6), + s_check_lookaround(0, true, 7), s_bin_union(5, 4), s_range(0, 255, 3), - s_look(Look::Start, 7), - s_check_lookaround(0, true, 8), + s_look(Look::Start, 6), s_write_lookaround(0), - s_byte(b'a', 9), + s_byte(b'a', 8), s_match(0) ] ); @@ -2310,11 +2282,10 @@ mod tests { assert_eq!( build(r"(?<=a)").states(), &[ - s_bin_union(1, 4), + s_check_lookaround(0, true, 5), s_bin_union(3, 2), s_range(b'\x00', b'\xFF', 1), - s_byte(b'a', 5), - s_check_lookaround(0, true, 6), + s_byte(b'a', 4), s_write_lookaround(0), s_match(0) ] @@ -2322,16 +2293,16 @@ mod tests { assert_eq!( build(r"(?<=a(? &Vec { + &self.0.start_look_behind + } + // FIXME: The `look_set_prefix_all` computation was not correct, and it // seemed a little tricky to fix it. Since I wasn't actually using it for // anything, I just decided to remove it in the run up to the regex 1.9 @@ -1270,6 +1276,8 @@ pub(super) struct Inner { /// This is needed to initialize the table for storing the result of /// look-around evaluation. lookaround_count: usize, + /// Contains the start states for each of the look-behind subexpressions. + start_look_behind: Vec, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1419,6 +1427,13 @@ impl Inner { self.start_pattern = start_pattern.to_vec(); } + pub(super) fn set_look_behind_starts( + &mut self, + look_behind_starts: &[StateID], + ) { + self.start_look_behind = look_behind_starts.to_vec(); + } + /// Sets the UTF-8 mode of this NFA. pub(super) fn set_utf8(&mut self, yes: bool) { self.utf8 = yes; @@ -1472,6 +1487,9 @@ impl Inner { for id in self.start_pattern.iter_mut() { *id = old_to_new[*id]; } + for id in self.start_look_behind.iter_mut() { + *id = old_to_new[*id]; + } } } @@ -1483,6 +1501,8 @@ impl fmt::Debug for Inner { '^' } else if sid == self.start_unanchored { '>' + } else if self.start_look_behind.contains(&sid) { + '<' } else { ' ' }; diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index eb40bf1a9..b3e6e45c9 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1263,7 +1263,46 @@ impl PikeVM { ref mut curr, ref mut next, ref mut lookaround, + ref mut curr_lookaround, + ref mut next_lookaround, } = cache; + + // This initializes the look-behind threads from the start of the input + // Note: since capture groups are not allowed inside look-behinds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. We need to add the start states + // in reverse because nested look-behinds have a higher index but must + // be executed first. + for look_behind_start in self.nfa.look_behind_starts() { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } + + // This brings the look-behind threads into the state they must be for + // starting at input.start() instead of the beginning. This is + // necessary for look-behinds to be able to match outside of the input + // span. + for lb_at in 0..input.start() { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. This // is necessary because matches are delayed by one byte, just like @@ -1374,6 +1413,17 @@ impl PikeVM { stack, slots, curr, lookaround, input, at, start_id, ); } + // The look-behind states must be processed first, since their + // result must be available for the processing of the main states. + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + at, + &mut [], + ); if let Some(pid) = self.nexts(stack, curr, next, lookaround, input, at, slots) { @@ -1387,7 +1437,9 @@ impl PikeVM { break; } core::mem::swap(curr, next); + core::mem::swap(curr_lookaround, next_lookaround); next.set.clear(); + next_lookaround.set.clear(); at += 1; } instrument!(|c| c.eprint(&self.nfa)); @@ -1442,7 +1494,34 @@ impl PikeVM { ref mut curr, ref mut next, ref mut lookaround, + ref mut curr_lookaround, + ref mut next_lookaround, } = cache; + + for look_behind_start in self.nfa.look_behind_starts() { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } + for lb_at in 0..input.start() { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { @@ -1459,6 +1538,15 @@ impl PikeVM { stack, slots, curr, lookaround, input, at, start_id, ); } + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + at, + &mut [], + ); self.nexts_overlapping( stack, curr, next, lookaround, input, at, patset, ); @@ -1470,7 +1558,9 @@ impl PikeVM { break; } core::mem::swap(curr, next); + core::mem::swap(curr_lookaround, next_lookaround); next.set.clear(); + next_lookaround.set.clear(); } instrument!(|c| c.eprint(&self.nfa)); } @@ -1976,6 +2066,10 @@ pub struct Cache { /// haystack at which look-around indexed x holds and which is <= to the /// current position". lookaround: Vec>, + /// The current active states for look-behind subexpressions. + curr_lookaround: ActiveStates, + /// The next set of states to be explored for look-behind subexpressions. + next_lookaround: ActiveStates, } impl Cache { @@ -1993,6 +2087,8 @@ impl Cache { curr: ActiveStates::new(re), next: ActiveStates::new(re), lookaround: vec![None; re.lookaround_count()], + curr_lookaround: ActiveStates::new(re), + next_lookaround: ActiveStates::new(re), } } @@ -2036,6 +2132,9 @@ impl Cache { pub fn reset(&mut self, re: &PikeVM) { self.curr.reset(re); self.next.reset(re); + self.curr_lookaround.reset(re); + self.next_lookaround.reset(re); + self.lookaround = vec![None; re.lookaround_count()]; } /// Returns the heap memory usage, in bytes, of this cache. @@ -2047,6 +2146,8 @@ impl Cache { (self.stack.len() * size_of::()) + self.curr.memory_usage() + self.next.memory_usage() + + self.curr_lookaround.memory_usage() + + self.next_lookaround.memory_usage() } /// Clears this cache. This should be called at the start of every search @@ -2063,6 +2164,10 @@ impl Cache { self.stack.clear(); self.curr.setup_search(captures_slot_len); self.next.setup_search(captures_slot_len); + // capture groups are not allowed inside look-arounds, so we + // set the slot-length to zero. + self.curr_lookaround.setup_search(0); + self.next_lookaround.setup_search(0); } } diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml index 8818a8f1a..14a303d7c 100644 --- a/testdata/lookaround.toml +++ b/testdata/lookaround.toml @@ -46,6 +46,18 @@ regex = "(?<=c[def]+(?