Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
68 commits
Select commit Hold shift + click to select a range
9f1a5e1
Add lookaround expressions to HIR
Multimodcrafter Mar 5, 2025
e7c2584
Change how flatten works on hir::Lookaround
shilangyu Mar 8, 2025
96548c1
Add hir::Lookaround to the visitor
shilangyu Mar 8, 2025
23764ce
Fix hir::Lookaround printing and add test
shilangyu Mar 8, 2025
fc48116
Remove useless ref
shilangyu Mar 8, 2025
258d36c
Add missing drop case for hir::Lookaround
shilangyu Mar 8, 2025
07be9ab
Rename Lookaround to LookAround
shilangyu Mar 8, 2025
d89e90a
Fix properties of LookArounds
shilangyu Mar 8, 2025
31b09a5
Add missing literal lookaround test
shilangyu Mar 8, 2025
59ccaa1
Fix literal test and useless property computation
shilangyu Mar 11, 2025
2c2c6bd
Adjust parsing errors for lookarounds
shilangyu Mar 8, 2025
dd0ce23
Add LookAround to Ast
shilangyu Mar 8, 2025
ffdb942
Disable failing tests
shilangyu Mar 8, 2025
3d31ff7
Fix UnsupportedCaptureInLookBehind typo
shilangyu Mar 8, 2025
76e11e6
Add unclosed lookaround error
shilangyu Mar 9, 2025
830cabe
Rename lookaround to look-around
shilangyu Mar 9, 2025
c353b35
Support parsing of look-behinds
shilangyu Mar 9, 2025
71c1a90
Reject lookbehinds with capture groups
shilangyu Mar 9, 2025
8352a9e
Add tests for parsing lookbehinds
shilangyu Mar 9, 2025
90828a2
Add AST -> HIR translation for lookarounds
shilangyu Mar 9, 2025
d35e65a
Fix typo
shilangyu Mar 9, 2025
ae33591
Allow for non-capturing groups in lookbehinds
shilangyu Mar 11, 2025
c1f7bb1
Fix missing LookAround in regex-cli
shilangyu Mar 11, 2025
cd070f8
Detect capture groups in lookarounds for cheaper
shilangyu Mar 11, 2025
bdc9bbd
Remove accidental import
shilangyu Mar 11, 2025
edba197
Add new instructions to NFA
Multimodcrafter Mar 6, 2025
f97aa92
Implement lookaround compilation
Multimodcrafter Mar 6, 2025
ccdab18
Restore compilation behaviour for regexes without lookarounds
Multimodcrafter Mar 11, 2025
519d13d
Address review comments
Multimodcrafter Mar 11, 2025
cccfc23
Implement look-around index generation
Multimodcrafter Mar 11, 2025
9c33cca
Change tracking of look-around state to index
Multimodcrafter Mar 11, 2025
e94394c
Fix cli tool and AST->HIR translation
Multimodcrafter Mar 13, 2025
273767d
Fix lookaround union order
Multimodcrafter Mar 13, 2025
170dbb4
Address review comments
Multimodcrafter Mar 18, 2025
c9814a3
Fix look-around indexing
Multimodcrafter Mar 18, 2025
d98cfbf
Add error messages and fix pre-filter
Multimodcrafter Mar 18, 2025
5284761
Add unit tests for look-behind assertions
Multimodcrafter Mar 18, 2025
85c555c
Bump version numbers
Multimodcrafter Mar 25, 2025
cff77ec
Adjust some docs
shilangyu Apr 2, 2025
4a87b58
Add lookbehind with capture group test
shilangyu Apr 3, 2025
21166e5
Change how test suite filters tests
shilangyu Apr 3, 2025
6192349
Change engine fallbacks
shilangyu Apr 4, 2025
07668e6
Rename lookaround_index
shilangyu Apr 4, 2025
9375900
Fix literals tests
shilangyu Apr 4, 2025
b12c09c
Fix anchors in lookarounds
shilangyu Apr 5, 2025
9c7c558
Fix broken doc link
Multimodcrafter Apr 10, 2025
0917a1d
Remove unneeded if condition
shilangyu Apr 17, 2025
88df919
Explain use of empty look-set
Multimodcrafter May 1, 2025
1e13645
Add regression tests
Multimodcrafter Apr 16, 2025
c2cebbb
Change compilation to disconnected components
Multimodcrafter Apr 16, 2025
5cc52ea
Implement look-behind state processing
Multimodcrafter Apr 17, 2025
1ec885a
Show look-behind starts in nfa debug print
Multimodcrafter Apr 17, 2025
df0ebca
Fix doc-link
Multimodcrafter Apr 17, 2025
7b9e339
Fix memory usage calculation
Multimodcrafter Apr 23, 2025
985a662
Fix spelling
Multimodcrafter May 1, 2025
be4a978
Implement matchall performance improvement
Multimodcrafter Apr 23, 2025
80f2607
Implement matchall speedup for meta-engine
Multimodcrafter Apr 23, 2025
869cf0c
Replace catchall with explicit ignore
Multimodcrafter May 1, 2025
0ce8356
Rephrase doc and fix lb start state order
Multimodcrafter May 1, 2025
aea8fa3
Disable lookaround scanning when none present
Multimodcrafter May 1, 2025
e35c8d9
Fast forward look-around threads upon prefiltering
shilangyu May 7, 2025
c43efb0
Add small test for prefiltered regex with lookbehind
shilangyu May 7, 2025
3abfcfd
Change literal extraction for look-arounds
shilangyu May 8, 2025
1f5c5c1
Update wrong doc
shilangyu May 8, 2025
cbc452e
Fix literal extraction tests
shilangyu May 8, 2025
9673c1a
Reverse look_behind_starts
shilangyu May 8, 2025
d5e7dc3
Fix NFA memory usage and typo
Multimodcrafter May 20, 2025
3d13971
Fix lookaround index initialization
Multimodcrafter Jun 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "regex"
version = "1.11.2" #:version
version = "1.12.0" #:version
authors = ["The Rust Project Developers", "Andrew Gallant <[email protected]>"]
license = "MIT OR Apache-2.0"
readme = "README.md"
Expand Down Expand Up @@ -176,14 +176,14 @@ default-features = false
# For the actual regex engines.
[dependencies.regex-automata]
path = "regex-automata"
version = "0.4.8"
version = "0.5.0"
default-features = false
features = ["alloc", "syntax", "meta", "nfa-pikevm"]

# For parsing regular expressions.
[dependencies.regex-syntax]
path = "regex-syntax"
version = "0.8.5"
version = "0.9.0"
default-features = false

[dev-dependencies]
Expand Down
4 changes: 2 additions & 2 deletions regex-automata/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "regex-automata"
version = "0.4.10" #:version
version = "0.5.0" #:version
authors = ["The Rust Project Developers", "Andrew Gallant <[email protected]>"]
description = "Automata construction and matching using regular expressions."
documentation = "https://docs.rs/regex-automata"
Expand Down Expand Up @@ -87,7 +87,7 @@ internal-instrument-pikevm = ["logging", "std"]
aho-corasick = { version = "1.0.0", optional = true, default-features = false }
log = { version = "0.4.14", optional = true }
memchr = { version = "2.6.0", optional = true, default-features = false }
regex-syntax = { path = "../regex-syntax", version = "0.8.5", optional = true, default-features = false }
regex-syntax = { path = "../regex-syntax", version = "0.9.0", optional = true, default-features = false }

[dev-dependencies]
anyhow = "1.0.69"
Expand Down
6 changes: 6 additions & 0 deletions regex-automata/src/dfa/dense.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5096,6 +5096,12 @@ impl BuildError {
BuildError { kind: BuildErrorKind::Unsupported(msg) }
}

/// Creates the `Unsupported` build error reported when a DFA is requested
/// for an NFA that contains look-around sub-expressions.
pub(crate) fn unsupported_lookaround() -> BuildError {
    let kind = BuildErrorKind::Unsupported(
        "cannot build DFAs for regexes with look-around \
         sub-expressions; use a different regex engine",
    );
    BuildError { kind }
}

/// Creates a build error whose kind is `TooManyStates`.
pub(crate) fn too_many_states() -> BuildError {
    let kind = BuildErrorKind::TooManyStates;
    BuildError { kind }
}
Expand Down
4 changes: 4 additions & 0 deletions regex-automata/src/dfa/determinize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,10 @@ impl<'a> Runner<'a> {
return Err(BuildError::unsupported_dfa_word_boundary_unicode());
}

if self.nfa.lookaround_count() > 0 {
return Err(BuildError::unsupported_lookaround());
}

// A sequence of "representative" bytes drawn from each equivalence
// class. These representative bytes are fed to the NFA to compute
// state transitions. This allows us to avoid re-computing state
Expand Down
15 changes: 15 additions & 0 deletions regex-automata/src/dfa/onepass.rs
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,9 @@ impl<'a> InternalBuilder<'a> {
));
}
assert_eq!(DEAD, self.add_empty_state()?);
if self.nfa.lookaround_count() > 0 {
return Err(BuildError::unsupported_lookaround());
}

// This is where the explicit slots start. We care about this because
// we only need to track explicit slots. The implicit slots---two for
Expand Down Expand Up @@ -638,6 +641,10 @@ impl<'a> InternalBuilder<'a> {
self.stack_push(nfa_id, Epsilons::empty())?;
while let Some((id, epsilons)) = self.stack.pop() {
match *self.nfa.state(id) {
thompson::State::WriteLookAround { .. }
| thompson::State::CheckLookAround { .. } => {
return Err(BuildError::unsupported_lookaround());
}
thompson::State::ByteRange { ref trans } => {
self.compile_transition(dfa_id, trans, epsilons)?;
}
Expand Down Expand Up @@ -2996,6 +3003,7 @@ enum BuildErrorKind {
UnsupportedLook { look: Look },
ExceededSizeLimit { limit: usize },
NotOnePass { msg: &'static str },
UnsupportedLookAround,
}

impl BuildError {
Expand Down Expand Up @@ -3026,6 +3034,10 @@ impl BuildError {
/// Creates a `NotOnePass` build error carrying the given static message
/// that explains why the pattern is not one-pass.
fn not_one_pass(msg: &'static str) -> BuildError {
    let kind = BuildErrorKind::NotOnePass { msg };
    BuildError { kind }
}

/// Creates the error reported when a one-pass DFA is requested for an NFA
/// that contains look-around states.
fn unsupported_lookaround() -> BuildError {
    let kind = BuildErrorKind::UnsupportedLookAround;
    BuildError { kind }
}
}

#[cfg(feature = "std")]
Expand Down Expand Up @@ -3074,6 +3086,9 @@ impl core::fmt::Display for BuildError {
pattern is not one-pass: {}",
msg,
),
UnsupportedLookAround => {
write!(f, "one-pass DFA does not support look-arounds")
}
}
}
}
Expand Down
3 changes: 3 additions & 0 deletions regex-automata/src/hybrid/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4056,6 +4056,9 @@ impl Builder {
&self,
nfa: thompson::NFA,
) -> Result<DFA, BuildError> {
if nfa.lookaround_count() > 0 {
return Err(BuildError::unsupported_lookaround());
}
let quitset = self.config.quit_set_from_nfa(&nfa)?;
let classes = self.config.byte_classes_from_nfa(&nfa, &quitset);
// Check that we can fit at least a few states into our cache,
Expand Down
6 changes: 6 additions & 0 deletions regex-automata/src/hybrid/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ impl BuildError {
different regex engine";
BuildError { kind: BuildErrorKind::Unsupported(msg) }
}

/// Creates the `Unsupported` build error reported when a lazy DFA is
/// requested for an NFA that contains look-around sub-expressions.
pub(crate) fn unsupported_lookaround() -> BuildError {
    BuildError {
        kind: BuildErrorKind::Unsupported(
            "cannot build DFAs for regexes with look-around \
             sub-expressions; use a different regex engine",
        ),
    }
}
}

#[cfg(feature = "std")]
Expand Down
26 changes: 23 additions & 3 deletions regex-automata/src/meta/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,8 @@ impl Regex {
&'r self,
input: I,
) -> FindMatches<'r, 'h> {
let cache = self.pool.get();
let mut cache = self.pool.get();
cache.keep_lookaround_state(true);
let it = iter::Searcher::new(input.into());
FindMatches { re: self, cache, it }
}
Expand Down Expand Up @@ -652,7 +653,8 @@ impl Regex {
&'r self,
input: I,
) -> CapturesMatches<'r, 'h> {
let cache = self.pool.get();
let mut cache = self.pool.get();
cache.keep_lookaround_state(true);
let caps = self.create_captures();
let it = iter::Searcher::new(input.into());
CapturesMatches { re: self, cache, caps, it }
Expand Down Expand Up @@ -2076,7 +2078,11 @@ impl<'r, 'h> Iterator for FindMatches<'r, 'h> {
#[inline]
fn next(&mut self) -> Option<Match> {
let FindMatches { re, ref mut cache, ref mut it } = *self;
it.advance(|input| Ok(re.search_with(cache, input)))
let result = it.advance(|input| Ok(re.search_with(cache, input)));
if result.is_none() {
cache.keep_lookaround_state(false);
}
result
}

#[inline]
Expand Down Expand Up @@ -2149,6 +2155,7 @@ impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> {
if caps.is_match() {
Some(caps.clone())
} else {
cache.keep_lookaround_state(false);
None
}
}
Expand Down Expand Up @@ -2385,6 +2392,19 @@ impl Cache {
re.imp.strat.reset_cache(self)
}

/// Set whether this cache keeps the state of look-behind assertions after
/// a match has been found.
///
/// The setting is forwarded to the PikeVM portion of this cache.
///
/// This must only be called with a value of `true` when a new search is
/// started at the end of a previously found match, otherwise the result
/// of any search after this call will most likely be wrong.
///
/// Calling this function with a value of `false` will clear any previously
/// stored look-behind state.
pub fn keep_lookaround_state(&mut self, keep: bool) {
self.pikevm.keep_lookaround_state(keep);
}

/// Returns the heap memory usage, in bytes, of this cache.
///
/// This does **not** include the stack size used up by this cache. To
Expand Down
4 changes: 4 additions & 0 deletions regex-automata/src/meta/reverse_inner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> {
| HirKind::Literal(_)
| HirKind::Class(_)
| HirKind::Look(_)
| HirKind::LookAround(_)
| HirKind::Repetition(_)
| HirKind::Alternation(_) => return None,
HirKind::Capture(hir::Capture { ref sub, .. }) => sub,
Expand Down Expand Up @@ -206,6 +207,9 @@ fn flatten(hir: &Hir) -> Hir {
HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()),
HirKind::Class(ref x) => Hir::class(x.clone()),
HirKind::Look(ref x) => Hir::look(x.clone()),
HirKind::LookAround(ref x) => {
Hir::lookaround(x.with(flatten(x.sub())))
}
HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))),
// This is the interesting case. We just drop the group information
// entirely and use the child HIR itself.
Expand Down
90 changes: 49 additions & 41 deletions regex-automata/src/meta/strategy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,11 @@ impl Pre<()> {
if !info.props()[0].look_set().is_empty() {
return None;
}
// For a similar reason, we require that it has zero look-around
// expressions.
if info.props()[0].contains_lookaround_expr() {
return None;
}
// Finally, currently, our prefilters are all oriented around
// leftmost-first match semantics, so don't try to use them if the
// caller asked for anything else.
Expand Down Expand Up @@ -490,49 +495,52 @@ impl Core {
// we know we aren't going to use the lazy DFA. So we do a config check
// up front, which is in practice the only way we won't try to use the
// DFA.
let (nfarev, hybrid, dfa) =
if !info.config().get_hybrid() && !info.config().get_dfa() {
(None, wrappers::Hybrid::none(), wrappers::DFA::none())
let (nfarev, hybrid, dfa) = if !info.config().get_hybrid()
&& !info.config().get_dfa()
// With look-arounds, the lazy DFA and dense DFA would fail to build
|| nfa.lookaround_count() > 0
{
(None, wrappers::Hybrid::none(), wrappers::DFA::none())
} else {
// FIXME: Technically, we don't quite yet KNOW that we need
// a reverse NFA. It's possible for the DFAs below to both
// fail to build just based on the forward NFA. In which case,
// building the reverse NFA was totally wasted work. But...
// fixing this requires breaking DFA construction apart into
// two pieces: one for the forward part and another for the
// reverse part. Quite annoying. Making it worse, when building
// both DFAs fails, it's quite likely that the NFA is large and
// that it will take quite some time to build the reverse NFA
// too. So... it's really probably worth it to do this!
let nfarev = thompson::Compiler::new()
// Currently, reverse NFAs don't support capturing groups,
// so we MUST disable them. But even if we didn't have to,
// we would, because nothing in this crate does anything
// useful with capturing groups in reverse. And of course,
// the lazy DFA ignores capturing groups in all cases.
.configure(
thompson_config
.clone()
.which_captures(WhichCaptures::None)
.reverse(true),
)
.build_many_from_hir(hirs)
.map_err(BuildError::nfa)?;
let dfa = if !info.config().get_dfa() {
wrappers::DFA::none()
} else {
// FIXME: Technically, we don't quite yet KNOW that we need
// a reverse NFA. It's possible for the DFAs below to both
// fail to build just based on the forward NFA. In which case,
// building the reverse NFA was totally wasted work. But...
// fixing this requires breaking DFA construction apart into
// two pieces: one for the forward part and another for the
// reverse part. Quite annoying. Making it worse, when building
// both DFAs fails, it's quite likely that the NFA is large and
// that it will take quite some time to build the reverse NFA
// too. So... it's really probably worth it to do this!
let nfarev = thompson::Compiler::new()
// Currently, reverse NFAs don't support capturing groups,
// so we MUST disable them. But even if we didn't have to,
// we would, because nothing in this crate does anything
// useful with capturing groups in reverse. And of course,
// the lazy DFA ignores capturing groups in all cases.
.configure(
thompson_config
.clone()
.which_captures(WhichCaptures::None)
.reverse(true),
)
.build_many_from_hir(hirs)
.map_err(BuildError::nfa)?;
let dfa = if !info.config().get_dfa() {
wrappers::DFA::none()
} else {
wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev)
};
let hybrid = if !info.config().get_hybrid() {
wrappers::Hybrid::none()
} else if dfa.is_some() {
debug!("skipping lazy DFA because we have a full DFA");
wrappers::Hybrid::none()
} else {
wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev)
};
(Some(nfarev), hybrid, dfa)
wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev)
};
let hybrid = if !info.config().get_hybrid() {
wrappers::Hybrid::none()
} else if dfa.is_some() {
debug!("skipping lazy DFA because we have a full DFA");
wrappers::Hybrid::none()
} else {
wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev)
};
(Some(nfarev), hybrid, dfa)
};
Ok(Core {
info,
pre,
Expand Down
8 changes: 8 additions & 0 deletions regex-automata/src/meta/wrappers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,12 @@ impl PikeVMCache {
PikeVMCache(Some(builder.get().0.create_cache()))
}

pub(crate) fn keep_lookaround_state(&mut self, keep: bool) {
if let Some(cache) = self.0.as_mut() {
cache.keep_lookaround_state(keep);
}
}

/// Resets the wrapped cache using the PikeVM obtained from `builder`.
///
/// # Panics
///
/// Panics if this wrapper holds no cache, because of the `unwrap` below.
pub(crate) fn reset(&mut self, builder: &PikeVM) {
self.0.as_mut().unwrap().reset(&builder.get().0);
}
Expand Down Expand Up @@ -204,6 +210,8 @@ impl BoundedBacktrackerEngine {
{
if !info.config().get_backtrack()
|| info.config().get_match_kind() != MatchKind::LeftmostFirst
// TODO: remove once look-around support is added.
|| nfa.lookaround_count() > 0
{
return Ok(None);
}
Expand Down
11 changes: 10 additions & 1 deletion regex-automata/src/nfa/thompson/backtrack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,9 @@ impl Builder {
nfa: NFA,
) -> Result<BoundedBacktracker, BuildError> {
nfa.look_set_any().available().map_err(BuildError::word)?;
if nfa.lookaround_count() > 0 {
return Err(BuildError::unsupported_lookarounds());
}
Ok(BoundedBacktracker { config: self.config.clone(), nfa })
}

Expand Down Expand Up @@ -1453,7 +1456,7 @@ impl BoundedBacktracker {
/// Execute a "step" in the backtracking algorithm.
///
/// A "step" is somewhat of a misnomer, because this routine keeps going
/// until it either runs out of things to try or fins a match. In the
/// until it either runs out of things to try or finds a match. In the
/// former case, it may have pushed some things on to the backtracking
/// stack, in which case, those will be tried next as part of the
/// 'backtrack' routine above.
Expand Down Expand Up @@ -1519,6 +1522,12 @@ impl BoundedBacktracker {
}
sid = next;
}
State::WriteLookAround { .. }
| State::CheckLookAround { .. } => {
unimplemented!(
"backtracking engine does not support look-arounds"
);
}
State::Union { ref alternates } => {
sid = match alternates.get(0) {
None => return None,
Expand Down
Loading
Loading