From 7954f97d0f56049d13d01ca963951876942a63c1 Mon Sep 17 00:00:00 2001 From: Robin Date: Wed, 16 Apr 2025 11:42:58 +0200 Subject: [PATCH 1/7] Add regression tests --- testdata/lookaround.toml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml index 8818a8f1a..14a303d7c 100644 --- a/testdata/lookaround.toml +++ b/testdata/lookaround.toml @@ -46,6 +46,18 @@ regex = "(?<=c[def]+(? Date: Wed, 16 Apr 2025 16:52:11 +0200 Subject: [PATCH 2/7] Change compilation to disconnected components --- regex-automata/src/nfa/thompson/builder.rs | 9 +++ regex-automata/src/nfa/thompson/compiler.rs | 79 +++++++-------------- regex-automata/src/nfa/thompson/nfa.rs | 9 +++ 3 files changed, 43 insertions(+), 54 deletions(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index c769fda23..e4b6ff665 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -340,6 +340,8 @@ pub struct Builder { /// contains a single regex, then `start_pattern[0]` and `start_anchored` /// are always equivalent. start_pattern: Vec, + /// The starting states for each individual look-behind sub-expression. + start_look_behind: Vec, /// A map from pattern ID to capture group index to name. (If no name /// exists, then a None entry is present. Thus, all capturing groups are /// present in this mapping.) @@ -449,6 +451,7 @@ impl Builder { remap.resize(self.states.len(), StateID::ZERO); nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern); + nfa.set_look_behind_starts(self.start_look_behind.as_slice()); nfa.set_captures(&self.captures).map_err(BuildError::captures)?; // The idea here is to convert our intermediate states to their final // form. 
The only real complexity here is the process of converting @@ -706,6 +709,12 @@ impl Builder { self.start_pattern.len() } + /// Adds the [`start_id`] to the set of starting states that is used when + /// running look-behind expressions. + pub fn start_look_behind(&mut self, start_id: StateID) { + self.start_look_behind.push(start_id); + } + /// Add an "empty" NFA state. /// /// An "empty" NFA state is a state with a single unconditional epsilon diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 7a9393d1e..5a7bccd72 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -711,11 +711,6 @@ pub struct Compiler { /// State used for caching common suffixes when compiling reverse UTF-8 /// automata (for Unicode character classes). utf8_suffix: RefCell, - /// Top level alternation state which is used to run all look-around - /// assertion checks in lockstep with the main expression. Each look-around - /// expression is compiled to a set of states that is patched into this - /// state, and this state is updated on each new pattern being compiled. - lookaround_alt: RefCell>, /// The next index to use for a look-around expression. lookaround_index: RefCell, } @@ -730,7 +725,6 @@ impl Compiler { utf8_state: RefCell::new(Utf8State::new()), trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), - lookaround_alt: RefCell::new(None), lookaround_index: RefCell::new(SmallIndex::ZERO), } } @@ -993,32 +987,11 @@ impl Compiler { let compiled = self.c_alt_iter(exprs.iter().map(|e| { let _ = self.start_pattern()?; - let has_lookarounds = - (e.borrow() as &Hir).properties().contains_lookaround_expr(); - let mut top_level_alt = if has_lookarounds { - self.add_union()? 
- } else { - StateID::ZERO - }; - if has_lookarounds { - let lookaround_prefix = - self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - let lookaround_alt = self.add_union()?; - self.patch(lookaround_prefix.end, lookaround_alt)?; - self.patch(top_level_alt, lookaround_prefix.start)?; - self.lookaround_alt.borrow_mut().replace(lookaround_alt); - } let one = self.c_cap(0, None, e.borrow())?; let match_state_id = self.add_match()?; self.patch(one.end, match_state_id)?; - if has_lookarounds { - self.patch(top_level_alt, one.start)?; - } else { - top_level_alt = one.start; - } - let _ = self.finish_pattern(top_level_alt)?; - self.lookaround_alt.borrow_mut().take(); - Ok(ThompsonRef { start: top_level_alt, end: match_state_id }) + let _ = self.finish_pattern(one.start)?; + Ok(ThompsonRef { start: one.start, end: match_state_id }) }))?; self.patch(unanchored_prefix.end, compiled.start)?; let nfa = self @@ -1052,25 +1025,25 @@ impl Compiler { &self, lookaround: &LookAround, ) -> Result { - let sub = self.c(lookaround.sub())?; - let pos = match lookaround { - LookAround::NegativeLookBehind(_) => false, - LookAround::PositiveLookBehind(_) => true, - }; let idx = *self.lookaround_index.borrow(); *self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more()) .map_err(|e| { BuildError::too_many_lookarounds(e.attempted() as usize) })?; + let pos = match lookaround { + LookAround::NegativeLookBehind(_) => false, + LookAround::PositiveLookBehind(_) => true, + }; let check = self.add_check_lookaround(idx, pos)?; + + let unanchored = + self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; + + let sub = self.c(lookaround.sub())?; let write = self.add_write_lookaround(idx)?; + self.patch(unanchored.end, sub.start)?; self.patch(sub.end, write)?; - self.patch( - self.lookaround_alt - .borrow() - .expect("Cannot compile look-around outside pattern"), - sub.start, - )?; + self.builder.borrow_mut().start_look_behind(unanchored.start); Ok(ThompsonRef { start: check, end: 
check }) } @@ -2169,13 +2142,12 @@ mod tests { &[ s_bin_union(2, 1), s_range(0, 255, 0), - s_bin_union(3, 6), + s_check_lookaround(0, true, 7), s_bin_union(5, 4), s_range(0, 255, 3), - s_look(Look::Start, 7), - s_check_lookaround(0, true, 8), + s_look(Look::Start, 6), s_write_lookaround(0), - s_byte(b'a', 9), + s_byte(b'a', 8), s_match(0) ] ); @@ -2310,11 +2282,10 @@ mod tests { assert_eq!( build(r"(?<=a)").states(), &[ - s_bin_union(1, 4), + s_check_lookaround(0, true, 5), s_bin_union(3, 2), s_range(b'\x00', b'\xFF', 1), - s_byte(b'a', 5), - s_check_lookaround(0, true, 6), + s_byte(b'a', 4), s_write_lookaround(0), s_match(0) ] @@ -2322,16 +2293,16 @@ mod tests { assert_eq!( build(r"(?<=a(?, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1419,6 +1421,13 @@ impl Inner { self.start_pattern = start_pattern.to_vec(); } + pub(super) fn set_look_behind_starts( + &mut self, + look_behind_starts: &[StateID], + ) { + self.start_look_behind = look_behind_starts.to_vec(); + } + /// Sets the UTF-8 mode of this NFA. 
pub(super) fn set_utf8(&mut self, yes: bool) { self.utf8 = yes; From d1130e4cc63b39928706546d90d2cd6a40f560c4 Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 17 Apr 2025 14:53:49 +0200 Subject: [PATCH 3/7] Implement look-behind state processing --- regex-automata/src/nfa/thompson/builder.rs | 1 + regex-automata/src/nfa/thompson/nfa.rs | 9 ++ regex-automata/src/nfa/thompson/pikevm.rs | 103 +++++++++++++++++++++ 3 files changed, 113 insertions(+) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index e4b6ff665..e2f8bf2ad 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -387,6 +387,7 @@ impl Builder { self.pattern_id = None; self.states.clear(); self.start_pattern.clear(); + self.start_look_behind.clear(); self.captures.clear(); self.memory_states = 0; } diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 219dba657..42904d5f2 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1106,6 +1106,12 @@ impl NFA { self.0.lookaround_count } + /// Returns the starting states for initializing look-behind evaluation + #[inline] + pub fn look_behind_starts(&self) -> &Vec<StateID> { + &self.0.start_look_behind + } + // FIXME: The `look_set_prefix_all` computation was not correct, and it // seemed a little tricky to fix it.
Since I wasn't actually using it for // anything, I just decided to remove it in the run up to the regex 1.9 @@ -1481,6 +1487,9 @@ impl Inner { for id in self.start_pattern.iter_mut() { *id = old_to_new[*id]; } + for id in self.start_look_behind.iter_mut() { + *id = old_to_new[*id]; + } } } diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index eb40bf1a9..746086d08 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1263,7 +1263,46 @@ impl PikeVM { ref mut curr, ref mut next, ref mut lookaround, + ref mut curr_lookaround, + ref mut next_lookaround, } = cache; + + // This initializes the look-behind threads from the start of the input. + // Note: since capture groups are not allowed inside look-behinds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. The start states are added in + // the order the compiler recorded them, which puts nested look-behinds + // first: they have a higher index but must be executed first. + for look_behind_start in self.nfa.look_behind_starts() { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } + + // This brings the look-behind threads into the state they must be for + // starting at input.start() instead of the beginning. This is + // necessary for lookbehinds to be able to match outside of the input + // span. + for lb_at in 0..input.start() { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it.
This // is necessary because matches are delayed by one byte, just like @@ -1374,6 +1413,17 @@ impl PikeVM { stack, slots, curr, lookaround, input, at, start_id, ); } + // The lookbehind states must be processed first, since their + // result must be available for the processing of the main states. + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + at, + &mut [], + ); if let Some(pid) = self.nexts(stack, curr, next, lookaround, input, at, slots) { @@ -1387,7 +1437,9 @@ impl PikeVM { break; } core::mem::swap(curr, next); + core::mem::swap(curr_lookaround, next_lookaround); next.set.clear(); + next_lookaround.set.clear(); at += 1; } instrument!(|c| c.eprint(&self.nfa)); @@ -1442,7 +1494,34 @@ impl PikeVM { ref mut curr, ref mut next, ref mut lookaround, + ref mut curr_lookaround, + ref mut next_lookaround, } = cache; + + for look_behind_start in self.nfa.look_behind_starts() { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } + for lb_at in 0..input.start() { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { @@ -1459,6 +1538,15 @@ impl PikeVM { stack, slots, curr, lookaround, input, at, start_id, ); } + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + at, + &mut [], + ); self.nexts_overlapping( stack, curr, next, lookaround, input, at, patset, ); @@ -1470,7 +1558,9 @@ impl PikeVM { break; } core::mem::swap(curr, next); + core::mem::swap(curr_lookaround, next_lookaround); next.set.clear(); + next_lookaround.set.clear(); } instrument!(|c| c.eprint(&self.nfa)); } @@ -1976,6 +2066,10 @@ pub struct Cache { /// haystack at which look-around indexed x holds and which is <= to the /// 
current position". lookaround: Vec>, + /// The current active states for look-behind subexpressions + curr_lookaround: ActiveStates, + /// The next set of states to be explored for look-behind subexpressions + next_lookaround: ActiveStates, } impl Cache { @@ -1993,6 +2087,8 @@ impl Cache { curr: ActiveStates::new(re), next: ActiveStates::new(re), lookaround: vec![None; re.lookaround_count()], + curr_lookaround: ActiveStates::new(re), + next_lookaround: ActiveStates::new(re), } } @@ -2036,6 +2132,9 @@ impl Cache { pub fn reset(&mut self, re: &PikeVM) { self.curr.reset(re); self.next.reset(re); + self.curr_lookaround.reset(re); + self.next_lookaround.reset(re); + self.lookaround = vec![None; re.lookaround_count()]; } /// Returns the heap memory usage, in bytes, of this cache. @@ -2063,6 +2162,10 @@ impl Cache { self.stack.clear(); self.curr.setup_search(captures_slot_len); self.next.setup_search(captures_slot_len); + // capture groups are not allowed inside look-arounds, so we + // set the slot-length to zero. 
+ self.curr_lookaround.setup_search(0); + self.next_lookaround.setup_search(0); } } From dc84e2ca760990f6df1c8f7a84f4fea31ef3f4b5 Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 17 Apr 2025 14:56:11 +0200 Subject: [PATCH 4/7] Show look-behind starts in nfa debug print --- regex-automata/src/nfa/thompson/nfa.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 42904d5f2..2ac69c761 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1501,6 +1501,8 @@ impl fmt::Debug for Inner { '^' } else if sid == self.start_unanchored { '>' + } else if self.start_look_behind.contains(&sid) { + '<' } else { ' ' }; From 2779f5aa77fa64c682b3bdba69032397f0216ea0 Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 17 Apr 2025 15:44:24 +0200 Subject: [PATCH 5/7] Fix doc-link --- regex-automata/src/nfa/thompson/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index e2f8bf2ad..4f2f9af79 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -710,7 +710,7 @@ impl Builder { self.start_pattern.len() } - /// Adds the [`start_id`] to the set of starting states that is used when + /// Adds the `start_id` to the set of starting states that is used when /// running look-behind expressions. 
pub fn start_look_behind(&mut self, start_id: StateID) { self.start_look_behind.push(start_id); From 39910478590b49e20644696faf60a0d9a8a019e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Wed, 23 Apr 2025 10:27:11 +0200 Subject: [PATCH 6/7] Fix memory usage calculation --- regex-automata/src/nfa/thompson/pikevm.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 746086d08..fb92e6e4d 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -2146,6 +2146,8 @@ impl Cache { (self.stack.len() * size_of::<FollowEpsilon>()) + self.curr.memory_usage() + self.next.memory_usage() + + self.curr_lookaround.memory_usage() + + self.next_lookaround.memory_usage() } /// Clears this cache. This should be called at the start of every search From cd7ba253145292774e4d26e29a37bcf987b545e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20H=C3=A4nni?= Date: Thu, 1 May 2025 11:52:23 +0200 Subject: [PATCH 7/7] Fix spelling --- regex-automata/src/nfa/thompson/nfa.rs | 4 ++-- regex-automata/src/nfa/thompson/pikevm.rs | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 2ac69c761..1d63bd64a 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1106,7 +1106,7 @@ impl NFA { self.0.lookaround_count } - /// Returns the starting states for initializing look-behind evaluation + /// Returns the starting states for initializing look-behind evaluation. #[inline] pub fn look_behind_starts(&self) -> &Vec<StateID> { &self.0.start_look_behind @@ -1276,7 +1276,7 @@ pub(super) struct Inner { /// This is needed to initialize the table for storing the result of /// look-around evaluation.
lookaround_count: usize, - /// Contains the start states for each of the look-behind subexpressions + /// Contains the start states for each of the look-behind subexpressions. start_look_behind: Vec, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index fb92e6e4d..b3e6e45c9 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1287,7 +1287,7 @@ impl PikeVM { // This brings the look-behind threads into the state they must be for // starting at input.start() instead of the beginning. This is - // necessary for lookbehinds to be able to match outside of the input + // necessary for look-behinds to be able to match outside of the input // span. for lb_at in 0..input.start() { self.nexts( @@ -1413,7 +1413,7 @@ impl PikeVM { stack, slots, curr, lookaround, input, at, start_id, ); } - // The lookbehind states must be processed first, since their + // The look-behind states must be processed first, since their // result must be available for the processing of the main states. self.nexts( stack, @@ -2066,9 +2066,9 @@ pub struct Cache { /// haystack at which look-around indexed x holds and which is <= to the /// current position". lookaround: Vec>, - /// The current active states for look-behind subexpressions + /// The current active states for look-behind subexpressions. curr_lookaround: ActiveStates, - /// The next set of states to be explored for look-behind subexpressions + /// The next set of states to be explored for look-behind subexpressions. next_lookaround: ActiveStates, }