From b8f4d06d1199781573c1a21be88bd114f32caccd Mon Sep 17 00:00:00 2001 From: shilangyu Date: Wed, 7 May 2025 21:40:07 +0200 Subject: [PATCH 1/6] Fast forward look-around threads upon prefiltering --- regex-automata/src/nfa/thompson/pikevm.rs | 95 +++++++++++++++-------- 1 file changed, 63 insertions(+), 32 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 813804884..4763d2dbd 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1293,24 +1293,16 @@ impl PikeVM { *look_behind_start, ); } - - // This brings the look-behind threads into the state they must be for - // starting at input.start() instead of the beginning. This is - // necessary for lookbehinds to be able to match outside of the input - // span. - for lb_at in 0..input.start() { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - &mut [], - ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); - } + // This is necessary for look-behinds to be able to match outside of the + // input span. + self.fast_forward_lookbehinds( + Span { start: 0, end: input.start() }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); } let mut hm = None; @@ -1352,7 +1344,21 @@ impl PikeVM { let span = Span::from(at..input.end()); match pre.find(input.haystack(), span) { None => break, - Some(ref span) => at = span.start, + Some(ref span) => { + if self.lookaround_count() > 0 { + // We are jumping ahead due to the pre-filter, thus we must bring + // the look-behind threads to the new position. + self.fast_forward_lookbehinds( + Span { start: at, end: span.start }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); + } + at = span.start + } } } } @@ -1459,6 +1465,36 @@ impl PikeVM { hm } + /// This brings the look-behind threads into the state they must be for + /// starting at [input.end]. The assumption is that they are currently + /// at [input.start]. + fn fast_forward_lookbehinds( + &self, + forward_span: Span, + input: &Input<'_>, + stack: &mut Vec, + curr_lookaround: &mut ActiveStates, + next_lookaround: &mut ActiveStates, + lookaround: &mut Vec>, + ) { + for lb_at in forward_span.start..forward_span.end { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + // Since capture groups are not allowed inside look-arounds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + } + /// The implementation for the 'which_overlapping_matches' API. Basically, /// we do a single scan through the entire haystack (unless our regex /// or search is anchored) and record every pattern that matched. In @@ -1527,19 +1563,14 @@ impl PikeVM { *look_behind_start, ); } - for lb_at in 0..input.start() { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - &mut [], - ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); - } + self.fast_forward_lookbehinds( + Span { start: 0, end: input.start() }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { From 6d72e9f4b0c05d0fc880142e333c0acc56b63c23 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Wed, 7 May 2025 21:45:58 +0200 Subject: [PATCH 2/6] Add small test for prefiltered regex with lookbehind --- testdata/lookaround.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml index 14a303d7c..91fab56a0 100644 --- a/testdata/lookaround.toml +++ b/testdata/lookaround.toml @@ -84,3 +84,9 @@ matches = [ [[1, 3], [1, 2], [2, 3]], [[5, 7], [5, 6], [6, 7]], ] + +[[test]] +name = "lookbehind matching before the prefiltered start position" +regex = "b(?<=ab)" +haystack = "ab" +matches = [[1, 2]] From 9d084813d8d703755ea8fad23791fc0de91cfcd2 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 8 May 2025 08:03:40 +0200 Subject: [PATCH 3/6] Change literal extraction for look-arounds --- regex-automata/src/meta/strategy.rs | 5 +++++ regex-syntax/src/hir/literal.rs | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 0ac830b9d..19823b555 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -258,6 +258,11 @@ impl Pre<()> { if !info.props()[0].look_set().is_empty() { return None; } + // For a similar reason, we require that it has zero look-around + // expressions. + if info.props()[0].contains_lookaround_expr() { + return None; + } // Finally, currently, our prefilters are all oriented around // leftmost-first match semantics, so don't try to use them if the // caller asked for anything else. diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index f419dd70e..84c89c1ee 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -172,8 +172,9 @@ impl Extractor { use crate::hir::HirKind::*; match *hir.kind() { - Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), - LookAround(_) => Seq::infinite(), + Empty | Look(_) | LookAround(_) => { + Seq::singleton(self::Literal::exact(vec![])) + } Literal(hir::Literal(ref bytes)) => { let mut seq = Seq::singleton(self::Literal::exact(bytes.to_vec())); From 2c030f011255b23f4d2de7b832b188a996d17571 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 8 May 2025 10:30:10 +0200 Subject: [PATCH 4/6] Update wrong doc --- regex-automata/src/nfa/thompson/pikevm.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 4763d2dbd..d976bfc12 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1466,8 +1466,8 @@ impl PikeVM { } /// This brings the look-behind threads into the state they must be for - /// starting at [input.end]. The assumption is that they are currently - /// at [input.start]. + /// starting at [forward_span.end]. The assumption is that they are currently + /// at [forward_span.start]. fn fast_forward_lookbehinds( &self, forward_span: Span, From 0bc37d2e12b66ec1a300a4e8070e16344731d5d9 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 8 May 2025 10:30:22 +0200 Subject: [PATCH 5/6] Fix literal extraction tests --- regex-syntax/src/hir/literal.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 84c89c1ee..e09879d81 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2457,16 +2457,16 @@ mod tests { #[test] fn lookaround() { - assert_eq!(inexact([I("a")], [I("b")]), e(r"a(?<=qwa)b")); - assert_eq!(inexact([I("a")], [I("b")]), e(r"a(? Date: Thu, 8 May 2025 10:31:20 +0200 Subject: [PATCH 6/6] Reverse look_behind_starts --- regex-automata/src/nfa/thompson/pikevm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index d976bfc12..b18101c53 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1552,7 +1552,7 @@ impl PikeVM { match_lookaround: _, } = cache; - for look_behind_start in self.nfa.look_behind_starts() { + for look_behind_start in self.nfa.look_behind_starts().iter().rev() { self.epsilon_closure( stack, &mut [],