Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions regex-automata/src/meta/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,8 @@ impl Regex {
&'r self,
input: I,
) -> FindMatches<'r, 'h> {
let cache = self.pool.get();
let mut cache = self.pool.get();
cache.keep_lookaround_state(true);
let it = iter::Searcher::new(input.into());
FindMatches { re: self, cache, it }
}
Expand Down Expand Up @@ -652,7 +653,8 @@ impl Regex {
&'r self,
input: I,
) -> CapturesMatches<'r, 'h> {
let cache = self.pool.get();
let mut cache = self.pool.get();
cache.keep_lookaround_state(true);
let caps = self.create_captures();
let it = iter::Searcher::new(input.into());
CapturesMatches { re: self, cache, caps, it }
Expand Down Expand Up @@ -2076,7 +2078,11 @@ impl<'r, 'h> Iterator for FindMatches<'r, 'h> {
#[inline]
fn next(&mut self) -> Option<Match> {
let FindMatches { re, ref mut cache, ref mut it } = *self;
it.advance(|input| Ok(re.search_with(cache, input)))
let result = it.advance(|input| Ok(re.search_with(cache, input)));
if result.is_none() {
cache.keep_lookaround_state(false);
}
result
}

#[inline]
Expand Down Expand Up @@ -2149,6 +2155,7 @@ impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> {
if caps.is_match() {
Some(caps.clone())
} else {
cache.keep_lookaround_state(false);
None
}
}
Expand Down Expand Up @@ -2385,6 +2392,19 @@ impl Cache {
re.imp.strat.reset_cache(self)
}

/// Controls whether look-behind state is retained once a match is found.
///
/// Pass `true` only when the next search begins at the end of a previously
/// found match; in any other situation, the result of any search performed
/// after this call will most likely be wrong.
///
/// Passing `false` discards any look-behind state stored so far.
pub fn keep_lookaround_state(&mut self, keep: bool) {
    // Forward the setting to the `pikevm` field of this cache.
    let pikevm_cache = &mut self.pikevm;
    pikevm_cache.keep_lookaround_state(keep);
}

/// Returns the heap memory usage, in bytes, of this cache.
///
/// This does **not** include the stack size used up by this cache. To
Expand Down
6 changes: 6 additions & 0 deletions regex-automata/src/meta/wrappers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,12 @@ impl PikeVMCache {
PikeVMCache(Some(builder.get().0.create_cache()))
}

/// Forwards `keep` to the wrapped cache's `keep_lookaround_state`.
///
/// Does nothing when no cache is present (the inner `Option` is `None`).
pub(crate) fn keep_lookaround_state(&mut self, keep: bool) {
    match self.0.as_mut() {
        Some(inner) => inner.keep_lookaround_state(keep),
        None => {}
    }
}

/// Resets the wrapped cache, passing it `builder.get().0`.
///
/// # Panics
///
/// Panics if no cache is present, because the inner `Option` is unwrapped.
pub(crate) fn reset(&mut self, builder: &PikeVM) {
    // Resolve the receiver first so a missing cache panics before the
    // builder is consulted, exactly as a single chained call would.
    let inner = self.0.as_mut().unwrap();
    inner.reset(&builder.get().0);
}
Expand Down
2 changes: 1 addition & 1 deletion regex-automata/src/nfa/thompson/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1038,12 +1038,12 @@ impl Compiler {

let unanchored =
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
self.builder.borrow_mut().start_look_behind(unanchored.start);

let sub = self.c(lookaround.sub())?;
let write = self.add_write_lookaround(idx)?;
self.patch(unanchored.end, sub.start)?;
self.patch(sub.end, write)?;
self.builder.borrow_mut().start_look_behind(unanchored.start);
Ok(ThompsonRef { start: check, end: check })
}

Expand Down
127 changes: 92 additions & 35 deletions regex-automata/src/nfa/thompson/pikevm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -891,6 +891,7 @@ impl PikeVM {
cache: &'c mut Cache,
input: I,
) -> FindMatches<'r, 'c, 'h> {
cache.keep_lookaround_state(true);
let caps = Captures::matches(self.get_nfa().group_info().clone());
let it = iter::Searcher::new(input.into());
FindMatches { re: self, cache, caps, it }
Expand Down Expand Up @@ -934,6 +935,7 @@ impl PikeVM {
cache: &'c mut Cache,
input: I,
) -> CapturesMatches<'r, 'c, 'h> {
cache.keep_lookaround_state(true);
let caps = self.create_captures();
let it = iter::Searcher::new(input.into());
CapturesMatches { re: self, cache, caps, it }
Expand Down Expand Up @@ -1265,42 +1267,50 @@ impl PikeVM {
ref mut lookaround,
ref mut curr_lookaround,
ref mut next_lookaround,
ref mut match_lookaround,
ref keep_lookaround_state,
} = cache;

// This initializes the look-behind threads from the start of the input
// Note: since capture groups are not allowed inside look-behinds,
// there won't be any Capture epsilon transitions and hence it is ok to
// use &mut [] for the slots parameter. We need to add the start states
// in reverse because nested look-behinds have a higher index but must
// be executed first.
for look_behind_start in self.nfa.look_behind_starts() {
self.epsilon_closure(
stack,
&mut [],
curr_lookaround,
lookaround,
input,
0,
*look_behind_start,
);
}
if let Some(active) = match_lookaround {
*curr_lookaround = active.clone();
} else if self.lookaround_count() > 0 {
// This initializes the look-behind threads from the start of the input.
// Note: since capture groups are not allowed inside look-behinds,
// there won't be any Capture epsilon transitions and hence it is ok to
// use &mut [] for the slots parameter. We need to add the start states
// in reverse because more deeply nested look-behinds have a higher index
// but must be executed first, so that the result is available for the
// outer expression.
for look_behind_start in self.nfa.look_behind_starts().iter().rev()
{
self.epsilon_closure(
stack,
&mut [],
curr_lookaround,
lookaround,
input,
0,
*look_behind_start,
);
}

// This brings the look-behind threads into the state they must be for
// starting at input.start() instead of the beginning. This is
// necessary for look-behinds to be able to match outside of the input
// span.
for lb_at in 0..input.start() {
self.nexts(
stack,
curr_lookaround,
next_lookaround,
lookaround,
input,
lb_at,
&mut [],
);
core::mem::swap(curr_lookaround, next_lookaround);
next_lookaround.set.clear();
// This brings the look-behind threads into the state they must be for
// starting at input.start() instead of the beginning. This is
// necessary for look-behinds to be able to match outside of the input
// span.
for lb_at in 0..input.start() {
self.nexts(
stack,
curr_lookaround,
next_lookaround,
lookaround,
input,
lb_at,
&mut [],
);
core::mem::swap(curr_lookaround, next_lookaround);
next_lookaround.set.clear();
}
}

let mut hm = None;
Expand Down Expand Up @@ -1428,6 +1438,9 @@ impl PikeVM {
self.nexts(stack, curr, next, lookaround, input, at, slots)
{
hm = Some(HalfMatch::new(pid, at));
if *keep_lookaround_state {
*match_lookaround = Some(curr_lookaround.clone());
}
}
// Unless the caller asked us to return early, we need to mush on
// to see if we can extend our match. (But note that 'nexts' will
Expand Down Expand Up @@ -1496,6 +1509,11 @@ impl PikeVM {
ref mut lookaround,
ref mut curr_lookaround,
ref mut next_lookaround,
// It makes no sense to keep any look-behind state for this version of
// the search, since the caller receives no information about
// where the search ended.
keep_lookaround_state: _,
match_lookaround: _,
} = cache;

for look_behind_start in self.nfa.look_behind_starts() {
Expand Down Expand Up @@ -1989,10 +2007,14 @@ impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
*self;
// 'advance' converts errors into panics, which is OK here because
// the PikeVM can never return an error.
it.advance(|input| {
let result = it.advance(|input| {
re.search(cache, input, caps);
Ok(caps.get_match())
})
});
if result.is_none() {
cache.keep_lookaround_state(false);
}
result
}
}

Expand Down Expand Up @@ -2034,6 +2056,7 @@ impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> {
if caps.is_match() {
Some(caps.clone())
} else {
cache.keep_lookaround_state(false);
None
}
}
Expand Down Expand Up @@ -2070,6 +2093,14 @@ pub struct Cache {
curr_lookaround: ActiveStates,
/// The next set of states to be explored for look-behind subexpressions.
next_lookaround: ActiveStates,
/// The set of active threads, belonging to look-behind expressions,
/// when a match was found. This is needed to resume a search after a match
/// was found (to look for further matches), without having to re-scan the
/// beginning of the haystack.
match_lookaround: Option<ActiveStates>,
/// When true, use the states of `match_lookaround` to initialize a search,
/// otherwise recompute from the beginning of the haystack.
keep_lookaround_state: bool,
}

impl Cache {
Expand All @@ -2089,6 +2120,8 @@ impl Cache {
lookaround: vec![None; re.lookaround_count()],
curr_lookaround: ActiveStates::new(re),
next_lookaround: ActiveStates::new(re),
match_lookaround: None,
keep_lookaround_state: false,
}
}

Expand Down Expand Up @@ -2135,6 +2168,25 @@ impl Cache {
self.curr_lookaround.reset(re);
self.next_lookaround.reset(re);
self.lookaround = vec![None; re.lookaround_count()];
self.match_lookaround = None;
self.keep_lookaround_state = false;
}

/// Controls whether this cache stores a copy of the active threads
/// belonging to look-behind assertions when a match is found.
///
/// This is a performance optimization. Only pass `true` when the next
/// search is going to start at the end of a previously found match;
/// otherwise, the result of look-behind sub-expressions will be out of
/// sync with the main regex.
///
/// Passing `false` also clears any look-behind state stored so far.
pub fn keep_lookaround_state(&mut self, keep: bool) {
    if !keep {
        // Turning the feature off drops whatever snapshot was saved.
        self.match_lookaround = None;
    }
    self.keep_lookaround_state = keep;
}

/// Returns the heap memory usage, in bytes, of this cache.
Expand All @@ -2143,11 +2195,16 @@ impl Cache {
/// compute that, use `std::mem::size_of::<Cache>()`.
pub fn memory_usage(&self) -> usize {
    use core::mem::size_of;
    // A saved look-behind snapshot only contributes while one is stored.
    let snapshot_bytes = self
        .match_lookaround
        .as_ref()
        .map_or(0, |saved| saved.memory_usage());
    // Bytes taken by the entries currently on the `FollowEpsilon` stack.
    let stack_bytes = self.stack.len() * size_of::<FollowEpsilon>();
    stack_bytes
        + self.curr.memory_usage()
        + self.next.memory_usage()
        + self.curr_lookaround.memory_usage()
        + self.next_lookaround.memory_usage()
        + snapshot_bytes
}

/// Clears this cache. This should be called at the start of every search
Expand Down