Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions regex-automata/src/nfa/thompson/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,8 @@ pub struct Builder {
/// contains a single regex, then `start_pattern[0]` and `start_anchored`
/// are always equivalent.
start_pattern: Vec<StateID>,
/// The starting states for each individual look-behind sub-expression.
start_look_behind: Vec<StateID>,
/// A map from pattern ID to capture group index to name. (If no name
/// exists, then a None entry is present. Thus, all capturing groups are
/// present in this mapping.)
Expand Down Expand Up @@ -385,6 +387,7 @@ impl Builder {
self.pattern_id = None;
self.states.clear();
self.start_pattern.clear();
self.start_look_behind.clear();
self.captures.clear();
self.memory_states = 0;
}
Expand Down Expand Up @@ -449,6 +452,7 @@ impl Builder {
remap.resize(self.states.len(), StateID::ZERO);

nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
nfa.set_look_behind_starts(self.start_look_behind.as_slice());
nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
// The idea here is to convert our intermediate states to their final
// form. The only real complexity here is the process of converting
Expand Down Expand Up @@ -706,6 +710,12 @@ impl Builder {
self.start_pattern.len()
}

/// Adds the `start_id` to the set of starting states that is used when
/// running look-behind expressions.
///
/// Each call appends one entry to this builder's list of look-behind
/// start states; entries are kept in call order and are not de-duplicated.
/// The accumulated list is what gets handed to the NFA through
/// `set_look_behind_starts`, and it is emptied together with the builder's
/// other per-build state (states, pattern starts, captures).
pub fn start_look_behind(&mut self, start_id: StateID) {
self.start_look_behind.push(start_id);
}

/// Add an "empty" NFA state.
///
/// An "empty" NFA state is a state with a single unconditional epsilon
Expand Down
79 changes: 25 additions & 54 deletions regex-automata/src/nfa/thompson/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -711,11 +711,6 @@ pub struct Compiler {
/// State used for caching common suffixes when compiling reverse UTF-8
/// automata (for Unicode character classes).
utf8_suffix: RefCell<Utf8SuffixMap>,
/// Top level alternation state which is used to run all look-around
/// assertion checks in lockstep with the main expression. Each look-around
/// expression is compiled to a set of states that is patched into this
/// state, and this state is updated on each new pattern being compiled.
lookaround_alt: RefCell<Option<StateID>>,
/// The next index to use for a look-around expression.
lookaround_index: RefCell<SmallIndex>,
}
Expand All @@ -730,7 +725,6 @@ impl Compiler {
utf8_state: RefCell::new(Utf8State::new()),
trie_state: RefCell::new(RangeTrie::new()),
utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
lookaround_alt: RefCell::new(None),
lookaround_index: RefCell::new(SmallIndex::ZERO),
}
}
Expand Down Expand Up @@ -993,32 +987,11 @@ impl Compiler {

let compiled = self.c_alt_iter(exprs.iter().map(|e| {
let _ = self.start_pattern()?;
let has_lookarounds =
(e.borrow() as &Hir).properties().contains_lookaround_expr();
let mut top_level_alt = if has_lookarounds {
self.add_union()?
} else {
StateID::ZERO
};
if has_lookarounds {
let lookaround_prefix =
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
let lookaround_alt = self.add_union()?;
self.patch(lookaround_prefix.end, lookaround_alt)?;
self.patch(top_level_alt, lookaround_prefix.start)?;
self.lookaround_alt.borrow_mut().replace(lookaround_alt);
}
let one = self.c_cap(0, None, e.borrow())?;
let match_state_id = self.add_match()?;
self.patch(one.end, match_state_id)?;
if has_lookarounds {
self.patch(top_level_alt, one.start)?;
} else {
top_level_alt = one.start;
}
let _ = self.finish_pattern(top_level_alt)?;
self.lookaround_alt.borrow_mut().take();
Ok(ThompsonRef { start: top_level_alt, end: match_state_id })
let _ = self.finish_pattern(one.start)?;
Ok(ThompsonRef { start: one.start, end: match_state_id })
}))?;
self.patch(unanchored_prefix.end, compiled.start)?;
let nfa = self
Expand Down Expand Up @@ -1052,25 +1025,25 @@ impl Compiler {
&self,
lookaround: &LookAround,
) -> Result<ThompsonRef, BuildError> {
let sub = self.c(lookaround.sub())?;
let pos = match lookaround {
LookAround::NegativeLookBehind(_) => false,
LookAround::PositiveLookBehind(_) => true,
};
let idx = *self.lookaround_index.borrow();
*self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more())
.map_err(|e| {
BuildError::too_many_lookarounds(e.attempted() as usize)
})?;
let pos = match lookaround {
LookAround::NegativeLookBehind(_) => false,
LookAround::PositiveLookBehind(_) => true,
};
let check = self.add_check_lookaround(idx, pos)?;

let unanchored =
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;

let sub = self.c(lookaround.sub())?;
let write = self.add_write_lookaround(idx)?;
self.patch(unanchored.end, sub.start)?;
self.patch(sub.end, write)?;
self.patch(
self.lookaround_alt
.borrow()
.expect("Cannot compile look-around outside pattern"),
sub.start,
)?;
self.builder.borrow_mut().start_look_behind(unanchored.start);
Ok(ThompsonRef { start: check, end: check })
}

Expand Down Expand Up @@ -2169,13 +2142,12 @@ mod tests {
&[
s_bin_union(2, 1),
s_range(0, 255, 0),
s_bin_union(3, 6),
s_check_lookaround(0, true, 7),
s_bin_union(5, 4),
s_range(0, 255, 3),
s_look(Look::Start, 7),
s_check_lookaround(0, true, 8),
s_look(Look::Start, 6),
s_write_lookaround(0),
s_byte(b'a', 9),
s_byte(b'a', 8),
s_match(0)
]
);
Expand Down Expand Up @@ -2310,28 +2282,27 @@ mod tests {
assert_eq!(
build(r"(?<=a)").states(),
&[
s_bin_union(1, 4),
s_check_lookaround(0, true, 5),
s_bin_union(3, 2),
s_range(b'\x00', b'\xFF', 1),
s_byte(b'a', 5),
s_check_lookaround(0, true, 6),
s_byte(b'a', 4),
s_write_lookaround(0),
s_match(0)
]
);
assert_eq!(
build(r"(?<=a(?<!b))").states(),
&[
s_bin_union(1, 8),
s_check_lookaround(0, true, 10),
s_bin_union(3, 2),
s_range(b'\x00', b'\xFF', 1),
s_bin_union(5, 4),
s_byte(b'a', 6),
s_byte(b'b', 7),
s_check_lookaround(0, false, 9),
s_write_lookaround(0),
s_check_lookaround(1, true, 10),
s_byte(b'a', 4),
s_check_lookaround(1, false, 9),
s_bin_union(7, 6),
s_range(b'\x00', b'\xFF', 5),
s_byte(b'b', 8),
s_write_lookaround(1),
s_write_lookaround(0),
s_match(0)
]
);
Expand Down
20 changes: 20 additions & 0 deletions regex-automata/src/nfa/thompson/nfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1106,6 +1106,12 @@ impl NFA {
self.0.lookaround_count
}

/// Returns the starting states for initializing look-behind evaluation.
///
/// The returned list holds the start states recorded for the look-behind
/// sub-expressions of this NFA. It is borrowed from the NFA; nothing is
/// copied or allocated by this call.
// NOTE(review): returning `&[StateID]` would be the more conventional
// signature for a read-only view; the `&Vec<StateID>` return type is left
// untouched here because callers are not visible — confirm before changing.
#[inline]
pub fn look_behind_starts(&self) -> &Vec<StateID> {
&self.0.start_look_behind
}

// FIXME: The `look_set_prefix_all` computation was not correct, and it
// seemed a little tricky to fix it. Since I wasn't actually using it for
// anything, I just decided to remove it in the run up to the regex 1.9
Expand Down Expand Up @@ -1270,6 +1276,8 @@ pub(super) struct Inner {
/// This is needed to initialize the table for storing the result of
/// look-around evaluation.
lookaround_count: usize,
/// Contains the start states for each of the look-behind subexpressions.
start_look_behind: Vec<StateID>,
/// Heap memory used indirectly by NFA states and other things (like the
/// various capturing group representations above). Since each state
/// might use a different amount of heap, we need to keep track of this
Expand Down Expand Up @@ -1419,6 +1427,13 @@ impl Inner {
self.start_pattern = start_pattern.to_vec();
}

/// Sets the start states of the look-behind sub-expressions of this NFA.
///
/// The given slice is copied into a freshly allocated vector, and any
/// look-behind start states stored previously are replaced.
pub(super) fn set_look_behind_starts(
&mut self,
look_behind_starts: &[StateID],
) {
self.start_look_behind = look_behind_starts.to_vec();
}

/// Sets the UTF-8 mode of this NFA.
pub(super) fn set_utf8(&mut self, yes: bool) {
self.utf8 = yes;
Expand Down Expand Up @@ -1472,6 +1487,9 @@ impl Inner {
for id in self.start_pattern.iter_mut() {
*id = old_to_new[*id];
}
for id in self.start_look_behind.iter_mut() {
*id = old_to_new[*id];
}
}
}

Expand All @@ -1483,6 +1501,8 @@ impl fmt::Debug for Inner {
'^'
} else if sid == self.start_unanchored {
'>'
} else if self.start_look_behind.contains(&sid) {
'<'
} else {
' '
};
Expand Down
Loading