Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
6c88105
Add lookaround expressions to HIR
Multimodcrafter Mar 5, 2025
3c54714
Change how flatten works on hir::Lookaround
shilangyu Mar 8, 2025
4eb4286
Add hir::Lookaround to the visitor
shilangyu Mar 8, 2025
bdee6b2
Fix hir::Lookaround printing and add test
shilangyu Mar 8, 2025
033e0eb
Remove useless ref
shilangyu Mar 8, 2025
8a5d895
Add missing drop case for hir::Lookaround
shilangyu Mar 8, 2025
951408c
Rename Lookaround to LookAround
shilangyu Mar 8, 2025
f01c5a2
Fix properties of LookArounds
shilangyu Mar 8, 2025
f14c47c
Add missing literal lookaround test
shilangyu Mar 8, 2025
0c28299
Fix literal test and useless property computation
shilangyu Mar 11, 2025
e597c99
Adjust parsing errors for lookarounds
shilangyu Mar 8, 2025
353c515
Add LookAround to Ast
shilangyu Mar 8, 2025
b1c7e5d
Disable failing tests
shilangyu Mar 8, 2025
fb8f1eb
Fix UnsupportedCaptureInLookBehind typo
shilangyu Mar 8, 2025
06139a4
Add unclosed lookaround error
shilangyu Mar 9, 2025
145ec42
Rename lookaround to look-around
shilangyu Mar 9, 2025
6af9719
Support parsing of look-behinds
shilangyu Mar 9, 2025
10a51fc
Reject lookbehinds with capture groups
shilangyu Mar 9, 2025
9fedb23
Add tests for parsing lookbehinds
shilangyu Mar 9, 2025
9e91469
Add AST -> HIR translation for lookarounds
shilangyu Mar 9, 2025
ab34194
Fix typo
shilangyu Mar 9, 2025
a3a0f05
Allow for non-capturing groups in lookbehinds
shilangyu Mar 11, 2025
959a8d2
Fix missing LookAround in regex-cli
shilangyu Mar 11, 2025
a133b6f
Detect capture groups in lookarounds for cheaper
shilangyu Mar 11, 2025
f87b5c0
Remove accidental import
shilangyu Mar 11, 2025
c670d15
Add new instructions to NFA
Multimodcrafter Mar 6, 2025
59f9d03
Implement lookaround compilation
Multimodcrafter Mar 6, 2025
bb375f5
Restore compilation behaviour for regexes without lookarounds
Multimodcrafter Mar 11, 2025
27e90fa
Address review comments
Multimodcrafter Mar 11, 2025
c126820
Implement look-around index generation
Multimodcrafter Mar 11, 2025
bfd8087
Change tracking of look-around state to index
Multimodcrafter Mar 11, 2025
183da7a
Fix cli tool and AST->HIR translation
Multimodcrafter Mar 13, 2025
eafed1e
Fix lookaround union order
Multimodcrafter Mar 13, 2025
21cef5e
Address review comments
Multimodcrafter Mar 18, 2025
d435d2a
Fix look-around indexing
Multimodcrafter Mar 18, 2025
ee10459
Add error messages and fix pre-filter
Multimodcrafter Mar 18, 2025
0b51fc5
Add unit tests for look-behind assertions
Multimodcrafter Mar 18, 2025
8c4beee
Bump version numbers
Multimodcrafter Mar 25, 2025
289506c
Adjust some docs
shilangyu Apr 2, 2025
65223a7
Add lookbehind with capture group test
shilangyu Apr 3, 2025
051c1bb
Change how test suite filters tests
shilangyu Apr 3, 2025
73eae18
Change engine fallbacks
shilangyu Apr 4, 2025
ec0c4bf
Rename lookaround_index
shilangyu Apr 4, 2025
fd83509
Fix literals tests
shilangyu Apr 4, 2025
bdceca5
Fix anchors in lookarounds
shilangyu Apr 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "regex"
version = "1.11.1" #:version
version = "1.12.0" #:version
authors = ["The Rust Project Developers", "Andrew Gallant <[email protected]>"]
license = "MIT OR Apache-2.0"
readme = "README.md"
Expand Down Expand Up @@ -176,14 +176,14 @@ default-features = false
# For the actual regex engines.
[dependencies.regex-automata]
path = "regex-automata"
version = "0.4.8"
version = "0.5.0"
default-features = false
features = ["alloc", "syntax", "meta", "nfa-pikevm"]

# For parsing regular expressions.
[dependencies.regex-syntax]
path = "regex-syntax"
version = "0.8.5"
version = "0.9.0"
default-features = false

[dev-dependencies]
Expand Down
4 changes: 2 additions & 2 deletions regex-automata/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "regex-automata"
version = "0.4.9" #:version
version = "0.5.0" #:version
authors = ["The Rust Project Developers", "Andrew Gallant <[email protected]>"]
description = "Automata construction and matching using regular expressions."
documentation = "https://docs.rs/regex-automata"
Expand Down Expand Up @@ -86,7 +86,7 @@ internal-instrument-pikevm = ["logging", "std"]
aho-corasick = { version = "1.0.0", optional = true, default-features = false }
log = { version = "0.4.14", optional = true }
memchr = { version = "2.6.0", optional = true, default-features = false }
regex-syntax = { path = "../regex-syntax", version = "0.8.5", optional = true, default-features = false }
regex-syntax = { path = "../regex-syntax", version = "0.9.0", optional = true, default-features = false }

[dev-dependencies]
anyhow = "1.0.69"
Expand Down
6 changes: 6 additions & 0 deletions regex-automata/src/dfa/dense.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5083,6 +5083,12 @@ impl BuildError {
BuildError { kind: BuildErrorKind::Unsupported(msg) }
}

/// Create the build error reported when a regex contains look-around
/// sub-expressions.
///
/// The returned error has kind `Unsupported` and carries a fixed message
/// telling the caller to use a different regex engine.
pub(crate) fn unsupported_lookaround() -> BuildError {
    BuildError {
        kind: BuildErrorKind::Unsupported(
            "cannot build DFAs for regexes with look-around \
             sub-expressions; use a different regex engine",
        ),
    }
}

/// Create a build error whose kind is `TooManyStates`.
pub(crate) fn too_many_states() -> BuildError {
    let kind = BuildErrorKind::TooManyStates;
    BuildError { kind }
}
Expand Down
4 changes: 4 additions & 0 deletions regex-automata/src/dfa/determinize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,10 @@ impl<'a> Runner<'a> {
return Err(BuildError::unsupported_dfa_word_boundary_unicode());
}

if self.nfa.lookaround_count() > 0 {
return Err(BuildError::unsupported_lookaround());
}

// A sequence of "representative" bytes drawn from each equivalence
// class. These representative bytes are fed to the NFA to compute
// state transitions. This allows us to avoid re-computing state
Expand Down
15 changes: 15 additions & 0 deletions regex-automata/src/dfa/onepass.rs
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,9 @@ impl<'a> InternalBuilder<'a> {
));
}
assert_eq!(DEAD, self.add_empty_state()?);
if self.nfa.lookaround_count() > 0 {
return Err(BuildError::unsupported_lookaround());
}

// This is where the explicit slots start. We care about this because
// we only need to track explicit slots. The implicit slots---two for
Expand Down Expand Up @@ -638,6 +641,10 @@ impl<'a> InternalBuilder<'a> {
self.stack_push(nfa_id, Epsilons::empty())?;
while let Some((id, epsilons)) = self.stack.pop() {
match *self.nfa.state(id) {
thompson::State::WriteLookAround { .. }
| thompson::State::CheckLookAround { .. } => {
return Err(BuildError::unsupported_lookaround());
}
thompson::State::ByteRange { ref trans } => {
self.compile_transition(dfa_id, trans, epsilons)?;
}
Expand Down Expand Up @@ -2996,6 +3003,7 @@ enum BuildErrorKind {
UnsupportedLook { look: Look },
ExceededSizeLimit { limit: usize },
NotOnePass { msg: &'static str },
UnsupportedLookAround,
}

impl BuildError {
Expand Down Expand Up @@ -3026,6 +3034,10 @@ impl BuildError {
/// Create a build error of kind `NotOnePass`.
///
/// `msg` is stored in the error kind; the `Display` impl appends it after
/// the "pattern is not one-pass" text.
fn not_one_pass(msg: &'static str) -> BuildError {
    let kind = BuildErrorKind::NotOnePass { msg };
    BuildError { kind }
}

/// Create a build error of kind `UnsupportedLookAround`.
///
/// This is the error returned when the NFA given to the one-pass DFA
/// builder contains look-around states.
fn unsupported_lookaround() -> BuildError {
    let kind = BuildErrorKind::UnsupportedLookAround;
    BuildError { kind }
}
}

#[cfg(feature = "std")]
Expand Down Expand Up @@ -3074,6 +3086,9 @@ impl core::fmt::Display for BuildError {
pattern is not one-pass: {}",
msg,
),
UnsupportedLookAround => {
write!(f, "one-pass DFA does not support look-arounds")
}
}
}
}
Expand Down
3 changes: 3 additions & 0 deletions regex-automata/src/hybrid/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4056,6 +4056,9 @@ impl Builder {
&self,
nfa: thompson::NFA,
) -> Result<DFA, BuildError> {
if nfa.lookaround_count() > 0 {
return Err(BuildError::unsupported_lookaround());
}
let quitset = self.config.quit_set_from_nfa(&nfa)?;
let classes = self.config.byte_classes_from_nfa(&nfa, &quitset);
// Check that we can fit at least a few states into our cache,
Expand Down
6 changes: 6 additions & 0 deletions regex-automata/src/hybrid/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ impl BuildError {
different regex engine";
BuildError { kind: BuildErrorKind::Unsupported(msg) }
}

/// Create the build error reported when the NFA contains look-around
/// sub-expressions.
///
/// The returned error has kind `Unsupported` and carries a fixed message
/// telling the caller to use a different regex engine.
pub(crate) fn unsupported_lookaround() -> BuildError {
    BuildError {
        kind: BuildErrorKind::Unsupported(
            "cannot build DFAs for regexes with look-around \
             sub-expressions; use a different regex engine",
        ),
    }
}
}

#[cfg(feature = "std")]
Expand Down
4 changes: 4 additions & 0 deletions regex-automata/src/meta/reverse_inner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> {
| HirKind::Literal(_)
| HirKind::Class(_)
| HirKind::Look(_)
| HirKind::LookAround(_)
| HirKind::Repetition(_)
| HirKind::Alternation(_) => return None,
HirKind::Capture(hir::Capture { ref sub, .. }) => sub,
Expand Down Expand Up @@ -206,6 +207,9 @@ fn flatten(hir: &Hir) -> Hir {
HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()),
HirKind::Class(ref x) => Hir::class(x.clone()),
HirKind::Look(ref x) => Hir::look(x.clone()),
HirKind::LookAround(ref x) => {
Hir::lookaround(x.with(flatten(x.sub())))
}
HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))),
// This is the interesting case. We just drop the group information
// entirely and use the child HIR itself.
Expand Down
85 changes: 44 additions & 41 deletions regex-automata/src/meta/strategy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -490,49 +490,52 @@ impl Core {
// we know we aren't going to use the lazy DFA. So we do a config check
// up front, which is in practice the only way we won't try to use the
// DFA.
let (nfarev, hybrid, dfa) =
if !info.config().get_hybrid() && !info.config().get_dfa() {
(None, wrappers::Hybrid::none(), wrappers::DFA::none())
let (nfarev, hybrid, dfa) = if !info.config().get_hybrid()
&& !info.config().get_dfa()
// With look-arounds, the lazy DFA and dense DFA would fail to build
|| nfa.lookaround_count() > 0
{
(None, wrappers::Hybrid::none(), wrappers::DFA::none())
} else {
// FIXME: Technically, we don't quite yet KNOW that we need
// a reverse NFA. It's possible for the DFAs below to both
// fail to build just based on the forward NFA. In which case,
// building the reverse NFA was totally wasted work. But...
// fixing this requires breaking DFA construction apart into
// two pieces: one for the forward part and another for the
// reverse part. Quite annoying. Making it worse, when building
// both DFAs fails, it's quite likely that the NFA is large and
// that it will take quite some time to build the reverse NFA
// too. So... it's really probably worth it to do this!
let nfarev = thompson::Compiler::new()
// Currently, reverse NFAs don't support capturing groups,
// so we MUST disable them. But even if we didn't have to,
// we would, because nothing in this crate does anything
// useful with capturing groups in reverse. And of course,
// the lazy DFA ignores capturing groups in all cases.
.configure(
thompson_config
.clone()
.which_captures(WhichCaptures::None)
.reverse(true),
)
.build_many_from_hir(hirs)
.map_err(BuildError::nfa)?;
let dfa = if !info.config().get_dfa() {
wrappers::DFA::none()
} else {
// FIXME: Technically, we don't quite yet KNOW that we need
// a reverse NFA. It's possible for the DFAs below to both
// fail to build just based on the forward NFA. In which case,
// building the reverse NFA was totally wasted work. But...
// fixing this requires breaking DFA construction apart into
// two pieces: one for the forward part and another for the
// reverse part. Quite annoying. Making it worse, when building
// both DFAs fails, it's quite likely that the NFA is large and
// that it will take quite some time to build the reverse NFA
// too. So... it's really probably worth it to do this!
let nfarev = thompson::Compiler::new()
// Currently, reverse NFAs don't support capturing groups,
// so we MUST disable them. But even if we didn't have to,
// we would, because nothing in this crate does anything
// useful with capturing groups in reverse. And of course,
// the lazy DFA ignores capturing groups in all cases.
.configure(
thompson_config
.clone()
.which_captures(WhichCaptures::None)
.reverse(true),
)
.build_many_from_hir(hirs)
.map_err(BuildError::nfa)?;
let dfa = if !info.config().get_dfa() {
wrappers::DFA::none()
} else {
wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev)
};
let hybrid = if !info.config().get_hybrid() {
wrappers::Hybrid::none()
} else if dfa.is_some() {
debug!("skipping lazy DFA because we have a full DFA");
wrappers::Hybrid::none()
} else {
wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev)
};
(Some(nfarev), hybrid, dfa)
wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev)
};
let hybrid = if !info.config().get_hybrid() {
wrappers::Hybrid::none()
} else if dfa.is_some() {
debug!("skipping lazy DFA because we have a full DFA");
wrappers::Hybrid::none()
} else {
wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev)
};
(Some(nfarev), hybrid, dfa)
};
Ok(Core {
info,
pre,
Expand Down
2 changes: 2 additions & 0 deletions regex-automata/src/meta/wrappers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,8 @@ impl BoundedBacktrackerEngine {
{
if !info.config().get_backtrack()
|| info.config().get_match_kind() != MatchKind::LeftmostFirst
// TODO: remove once look-around support is added.
|| nfa.lookaround_count() > 0
{
return Ok(None);
}
Expand Down
11 changes: 10 additions & 1 deletion regex-automata/src/nfa/thompson/backtrack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,9 @@ impl Builder {
nfa: NFA,
) -> Result<BoundedBacktracker, BuildError> {
nfa.look_set_any().available().map_err(BuildError::word)?;
if nfa.lookaround_count() > 0 {
return Err(BuildError::unsupported_lookarounds());
}
Ok(BoundedBacktracker { config: self.config.clone(), nfa })
}

Expand Down Expand Up @@ -1453,7 +1456,7 @@ impl BoundedBacktracker {
/// Execute a "step" in the backtracking algorithm.
///
/// A "step" is somewhat of a misnomer, because this routine keeps going
/// until it either runs out of things to try or fins a match. In the
/// until it either runs out of things to try or finds a match. In the
/// former case, it may have pushed some things on to the backtracking
/// stack, in which case, those will be tried next as part of the
/// 'backtrack' routine above.
Expand Down Expand Up @@ -1519,6 +1522,12 @@ impl BoundedBacktracker {
}
sid = next;
}
State::WriteLookAround { .. }
| State::CheckLookAround { .. } => {
unimplemented!(
"backtracking engine does not support look-arounds"
);
}
State::Union { ref alternates } => {
sid = match alternates.get(0) {
None => return None,
Expand Down
58 changes: 57 additions & 1 deletion regex-automata/src/nfa/thompson/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,17 @@ enum State {
/// The next state that this state should transition to.
next: StateID,
},
/// An empty state that behaves analogously to a `Match` state but for
/// the look-around sub-expression with the given look-around index.
WriteLookAround { lookaround_index: SmallIndex },
/// A conditional epsilon transition that will only be taken if the
/// look-around sub-expression with the given index evaluates to `positive`
/// at the current position in the haystack.
CheckLookAround {
lookaround_index: SmallIndex,
positive: bool,
next: StateID,
},
/// An alternation such that there exists an epsilon transition to all
/// states in `alternates`, where matches found via earlier transitions
/// are preferred over later transitions.
Expand Down Expand Up @@ -154,7 +165,9 @@ impl State {
| State::CaptureStart { .. }
| State::CaptureEnd { .. }
| State::Fail
| State::Match { .. } => 0,
| State::Match { .. }
| State::CheckLookAround { .. }
| State::WriteLookAround { .. } => 0,
State::Sparse { ref transitions } => {
transitions.len() * mem::size_of::<Transition>()
}
Expand Down Expand Up @@ -470,6 +483,21 @@ impl Builder {
State::Look { look, next } => {
remap[sid] = nfa.add(nfa::State::Look { look, next });
}
State::WriteLookAround { lookaround_index } => {
remap[sid] = nfa
.add(nfa::State::WriteLookAround { lookaround_index });
}
State::CheckLookAround {
lookaround_index,
positive,
next,
} => {
remap[sid] = nfa.add(nfa::State::CheckLookAround {
lookaround_index,
positive,
next,
});
}
State::CaptureStart { pattern_id, group_index, next } => {
// We can't remove this empty state because of the side
// effect of capturing an offset for this capture slot.
Expand Down Expand Up @@ -693,6 +721,30 @@ impl Builder {
self.add(State::Empty { next: StateID::ZERO })
}

/// Add a "write look-around" state for the look-around sub-expression
/// identified by `index`.
///
/// Reaching this state records that the look-around with the given index
/// is satisfied at the current position. The state carries only the
/// look-around index and has no outgoing transition.
///
/// Returns whatever `add` returns for the new state: its identifier on
/// success, or a build error.
pub fn add_write_lookaround(
    &mut self,
    index: SmallIndex,
) -> Result<StateID, BuildError> {
    let state = State::WriteLookAround { lookaround_index: index };
    self.add(state)
}

/// Add a "check look-around" state: a conditional epsilon transition to
/// `next`.
///
/// The transition is only taken when the look-around sub-expression with
/// the given `index` evaluates to `positive` at the current position in the
/// haystack.
///
/// Returns whatever `add` returns for the new state: its identifier on
/// success, or a build error.
pub fn add_check_lookaround(
    &mut self,
    index: SmallIndex,
    positive: bool,
    next: StateID,
) -> Result<StateID, BuildError> {
    let state =
        State::CheckLookAround { lookaround_index: index, positive, next };
    self.add(state)
}

/// Add a "union" NFA state.
///
/// A "union" NFA state that contains zero or more unconditional epsilon
Expand Down Expand Up @@ -1159,6 +1211,9 @@ impl Builder {
State::Look { ref mut next, .. } => {
*next = to;
}
State::CheckLookAround { ref mut next, .. } => {
*next = to;
}
State::Union { ref mut alternates } => {
alternates.push(to);
self.memory_states += mem::size_of::<StateID>();
Expand All @@ -1173,6 +1228,7 @@ impl Builder {
State::CaptureEnd { ref mut next, .. } => {
*next = to;
}
State::WriteLookAround { .. } => {}
State::Fail => {}
State::Match { .. } => {}
}
Expand Down
Loading
Loading