diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 0ac830b9d..19823b555 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -258,6 +258,11 @@ impl Pre<()> { if !info.props()[0].look_set().is_empty() { return None; } + // For a similar reason, we require that it has zero look-around + // expressions. + if info.props()[0].contains_lookaround_expr() { + return None; + } // Finally, currently, our prefilters are all oriented around // leftmost-first match semantics, so don't try to use them if the // caller asked for anything else. diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 813804884..b18101c53 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1293,24 +1293,16 @@ impl PikeVM { *look_behind_start, ); } - - // This brings the look-behind threads into the state they must be for - // starting at input.start() instead of the beginning. This is - // necessary for lookbehinds to be able to match outside of the input - // span. - for lb_at in 0..input.start() { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - &mut [], - ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); - } + // This is necessary for look-behinds to be able to match outside of the + // input span. + self.fast_forward_lookbehinds( + Span { start: 0, end: input.start() }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); } let mut hm = None; @@ -1352,7 +1344,21 @@ impl PikeVM { let span = Span::from(at..input.end()); match pre.find(input.haystack(), span) { None => break, - Some(ref span) => at = span.start, + Some(ref span) => { + if self.lookaround_count() > 0 { + // We are jumping ahead due to the pre-filter, thus we must bring + // the look-behind threads to the new position. 
+ self.fast_forward_lookbehinds( + Span { start: at, end: span.start }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); + } + at = span.start + } } } } @@ -1459,6 +1465,36 @@ impl PikeVM { hm } + /// This brings the look-behind threads into the state they must be for + /// starting at `forward_span.end`. The assumption is that they are currently + /// at `forward_span.start`; every position in the span is stepped through `nexts`. + fn fast_forward_lookbehinds( + &self, + forward_span: Span, + input: &Input<'_>, + stack: &mut Vec, + curr_lookaround: &mut ActiveStates, + next_lookaround: &mut ActiveStates, + lookaround: &mut Vec>, + ) { + for lb_at in forward_span.start..forward_span.end { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + // Since capture groups are not allowed inside look-arounds, + // there won't be any `Capture` epsilon transitions and hence it is ok to + // use `&mut []` for the slots parameter. + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + } + /// The implementation for the 'which_overlapping_matches' API. Basically, /// we do a single scan through the entire haystack (unless our regex /// or search is anchored) and record every pattern that matched. 
In @@ -1516,7 +1552,7 @@ impl PikeVM { match_lookaround: _, } = cache; - for look_behind_start in self.nfa.look_behind_starts() { + for look_behind_start in self.nfa.look_behind_starts().iter().rev() { self.epsilon_closure( stack, &mut [], @@ -1527,19 +1563,14 @@ impl PikeVM { *look_behind_start, ); } - for lb_at in 0..input.start() { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - &mut [], - ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); - } + self.fast_forward_lookbehinds( + Span { start: 0, end: input.start() }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index f419dd70e..e09879d81 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -172,8 +172,9 @@ impl Extractor { use crate::hir::HirKind::*; match *hir.kind() { - Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), - LookAround(_) => Seq::infinite(), + Empty | Look(_) | LookAround(_) => { + Seq::singleton(self::Literal::exact(vec![])) + } Literal(hir::Literal(ref bytes)) => { let mut seq = Seq::singleton(self::Literal::exact(bytes.to_vec())); @@ -2456,16 +2457,16 @@ mod tests { #[test] fn lookaround() { - assert_eq!(inexact([I("a")], [I("b")]), e(r"a(?<=qwa)b")); - assert_eq!(inexact([I("a")], [I("b")]), e(r"a(?