Skip to content

Commit 87730fc

Browse files
committed
Store look-behind offsets separately
1 parent 9e278fe commit 87730fc

File tree

4 files changed

+242
-162
lines changed

4 files changed

+242
-162
lines changed

regex-automata/src/nfa/thompson/builder.rs

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use alloc::{sync::Arc, vec, vec::Vec};
55
use crate::{
66
nfa::thompson::{
77
error::BuildError,
8-
nfa::{self, LookBehindInfo, SparseTransitions, Transition, NFA},
8+
nfa::{self, LookBehindTree, SparseTransitions, Transition, NFA},
99
},
1010
util::{
1111
look::{Look, LookMatcher},
@@ -340,11 +340,9 @@ pub struct Builder {
340340
/// contains a single regex, then `start_pattern[0]` and `start_anchored`
341341
/// are always equivalent.
342342
start_pattern: Vec<StateID>,
343-
/// A vector of meta-data information about each look-behind in this NFA.
344-
///
345-
/// Must be stored in a depth-first pre-order with regards to the nesting
346-
/// of look-behinds.
347-
lookbehinds: Vec<LookBehindInfo>,
343+
/// A vector of look-behinds appearing in the regex. Order reflects the
344+
/// order in the regex.
345+
lookbehinds: Vec<LookBehindTree>,
348346
/// A map from pattern ID to capture group index to name. (If no name
349347
/// exists, then a None entry is present. Thus, all capturing groups are
350348
/// present in this mapping.)
@@ -719,14 +717,21 @@ impl Builder {
719717
/// starts.
720718
///
721719
/// Look-behinds must be started in a depth-first pre-order fashion with
722-
/// regards to the nesting of look-behinds.
720+
/// regards to the nesting of look-behinds. The nesting path is stored
721+
/// as indices in `path`.
723722
pub fn start_lookbehind(
724723
&mut self,
725724
start_id: StateID,
726725
offset_from_start: Option<usize>,
726+
path: &[usize],
727727
) {
728-
self.lookbehinds
729-
.push(LookBehindInfo::new(start_id, offset_from_start));
728+
let mut current = &mut self.lookbehinds;
729+
730+
for index in path {
731+
current = current[*index].children_mut();
732+
}
733+
734+
current.push(LookBehindTree::new(start_id, offset_from_start));
730735
}
731736

732737
/// Add an "empty" NFA state.

regex-automata/src/nfa/thompson/compiler.rs

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,8 @@ pub struct Compiler {
718718
/// the current look-behind expression. When `None`, the distance can be
719719
/// seen as infinity.
720720
current_lookbehind_offset_from_start: RefCell<Option<usize>>,
721+
/// The current path of look-behind nesting.
722+
lookbehind_nesting_path: RefCell<Vec<usize>>,
721723
}
722724

723725
impl Compiler {
@@ -732,6 +734,7 @@ impl Compiler {
732734
utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
733735
lookaround_index: RefCell::new(SmallIndex::ZERO),
734736
current_lookbehind_offset_from_start: RefCell::new(Some(0)),
737+
lookbehind_nesting_path: RefCell::new(vec![0]),
735738
}
736739
}
737740

@@ -971,6 +974,9 @@ impl Compiler {
971974
self.builder
972975
.borrow_mut()
973976
.set_size_limit(self.config.get_nfa_size_limit())?;
977+
*self.lookaround_index.borrow_mut() = SmallIndex::ZERO;
978+
*self.lookbehind_nesting_path.borrow_mut() = vec![0];
979+
*self.current_lookbehind_offset_from_start.borrow_mut() = Some(0);
974980

975981
// We always add an unanchored prefix unless we were specifically told
976982
// not to (for tests only), or if we know that the regex is anchored
@@ -1058,15 +1064,22 @@ impl Compiler {
10581064

10591065
let unanchored =
10601066
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
1061-
self.builder
1062-
.borrow_mut()
1063-
.start_lookbehind(unanchored.start, start_offset);
1067+
self.builder.borrow_mut().start_lookbehind(
1068+
unanchored.start,
1069+
start_offset,
1070+
self.lookbehind_nesting_path.borrow().split_last().unwrap().1,
1071+
);
10641072

10651073
// When compiling the subexpression we temporarily change the starting
10661074
// offset and restore it after. This way, the subexpression is relativized
1067-
// to our current offset.
1075+
// to our current offset. We also update the path to the current lookbehind
1076+
// expression.
1077+
self.lookbehind_nesting_path.borrow_mut().push(0);
10681078
*self.current_lookbehind_offset_from_start.borrow_mut() = start_offset;
10691079
let sub = self.c(lookaround.sub())?;
1080+
let mut path = self.lookbehind_nesting_path.borrow_mut();
1081+
path.pop();
1082+
*path.last_mut().unwrap() += 1;
10701083
*self.current_lookbehind_offset_from_start.borrow_mut() =
10711084
relative_start;
10721085

regex-automata/src/nfa/thompson/nfa.rs

Lines changed: 70 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,7 +1108,7 @@ impl NFA {
11081108

11091109
/// Returns the starting states for initializing look-behind evaluation.
11101110
#[inline]
1111-
pub fn lookbehinds(&self) -> &Vec<LookBehindInfo> {
1111+
pub fn lookbehinds(&self) -> &[LookBehindTree] {
11121112
&self.0.lookbehinds
11131113
}
11141114

@@ -1276,46 +1276,83 @@ pub(super) struct Inner {
12761276
/// This is needed to initialize the table for storing the result of
12771277
/// look-around evaluation.
12781278
lookaround_count: usize,
1279-
/// A vector of meta-data information about each look-behind in this NFA.
1280-
///
1281-
/// Must be stored in a depth-first pre-order with regards to the nesting
1282-
/// of look-behinds.
1283-
lookbehinds: Vec<LookBehindInfo>,
1279+
/// A vector of look-behinds appearing in the regex. Order reflects the
1280+
/// order in the regex.
1281+
lookbehinds: Vec<LookBehindTree>,
12841282
/// Heap memory used indirectly by NFA states and other things (like the
12851283
/// various capturing group representations above). Since each state
12861284
/// might use a different amount of heap, we need to keep track of this
12871285
/// incrementally.
12881286
memory_extra: usize,
12891287
}
12901288

1291-
/// Information about a look-behind needed for execution.
1292-
#[derive(Clone, Copy, Debug)]
1293-
pub struct LookBehindInfo {
1294-
/// The id of the start state of the look-behind subexpression.
1289+
/// Information about the look-behinds needed for execution. It preserves the
1290+
/// nesting structure of look-behinds.
1291+
#[derive(Clone, Debug)]
1292+
pub struct LookBehindTree {
12951293
start_id: StateID,
1296-
/// The offset (in bytes) from the beginning of the main regex that a
1297-
/// look-behind starts at. If `None`, the offset is unbounded.
12981294
offset_from_start: Option<usize>,
1295+
children: Vec<LookBehindTree>,
12991296
}
13001297

1301-
impl LookBehindInfo {
1302-
pub(super) fn new(
1303-
start_id: StateID,
1304-
offset_from_start: Option<usize>,
1305-
) -> Self {
1306-
Self { start_id, offset_from_start }
1298+
impl LookBehindTree {
1299+
pub fn new(start_id: StateID, offset_from_start: Option<usize>) -> Self {
1300+
Self { start_id, offset_from_start, children: Vec::new() }
13071301
}
13081302

1309-
/// Start states of the look-behind subexpression.
1310-
pub(super) fn start_state(&self) -> StateID {
1303+
/// The id of the start state of the look-behind subexpression.
1304+
pub fn start_id(&self) -> StateID {
13111305
self.start_id
13121306
}
13131307

13141308
/// The offset (in bytes) from the beginning of the main regex that a
13151309
/// look-behind starts at. If `None`, the offset is unbounded.
1316-
pub(super) fn offset_from_start(&self) -> Option<usize> {
1310+
pub fn offset_from_start(&self) -> Option<usize> {
13171311
self.offset_from_start
13181312
}
1313+
1314+
/// The look-behinds this look-behind contains. Order reflects the order
1315+
/// in the regex.
1316+
pub fn children(&self) -> &[LookBehindTree] {
1317+
&self.children
1318+
}
1319+
1320+
/// Calls `fun` on this look-behind tree and all of its descendants in pre-order.
1321+
/// `fun` should return `true` if the traversal should continue and `false`
1322+
/// if it should stop.
1323+
///
1324+
/// Returns `true` if every node was visited, or `false` if `fun` stopped the traversal early.
1325+
pub fn preorder(&self, fun: &impl Fn(&LookBehindTree) -> bool) -> bool {
1326+
if !fun(self) {
1327+
return false;
1328+
}
1329+
for child in &self.children {
1330+
if !child.preorder(fun) {
1331+
return false;
1332+
}
1333+
}
1334+
true
1335+
}
1336+
1337+
/// Like [`Self::preorder`], but allows mutating the nodes.
1338+
pub fn preorder_mut(
1339+
&mut self,
1340+
fun: &impl Fn(&mut LookBehindTree) -> bool,
1341+
) -> bool {
1342+
if !fun(self) {
1343+
return false;
1344+
}
1345+
for child in &mut self.children {
1346+
if !child.preorder_mut(fun) {
1347+
return false;
1348+
}
1349+
}
1350+
true
1351+
}
1352+
1353+
pub fn children_mut(&mut self) -> &mut Vec<LookBehindTree> {
1354+
&mut self.children
1355+
}
13191356
}
13201357

13211358
impl Inner {
@@ -1464,7 +1501,7 @@ impl Inner {
14641501
///
14651502
/// The slice must be in a depth-first pre-order with regards to the
14661503
/// nesting of look-behinds.
1467-
pub(super) fn set_lookbehinds(&mut self, lookbehinds: &[LookBehindInfo]) {
1504+
pub(super) fn set_lookbehinds(&mut self, lookbehinds: &[LookBehindTree]) {
14681505
self.lookbehinds = lookbehinds.to_vec();
14691506
}
14701507

@@ -1521,9 +1558,12 @@ impl Inner {
15211558
for id in self.start_pattern.iter_mut() {
15221559
*id = old_to_new[*id];
15231560
}
1524-
for LookBehindInfo { start_id: id, .. } in self.lookbehinds.iter_mut()
1525-
{
1526-
*id = old_to_new[*id];
1561+
1562+
for lbs in self.lookbehinds.iter_mut() {
1563+
lbs.preorder_mut(&|e| {
1564+
e.start_id = old_to_new[e.start_id];
1565+
true
1566+
});
15271567
}
15281568
}
15291569
}
@@ -1536,7 +1576,11 @@ impl fmt::Debug for Inner {
15361576
'^'
15371577
} else if sid == self.start_unanchored {
15381578
'>'
1539-
} else if self.lookbehinds.iter().any(|i| i.start_state() == sid) {
1579+
} else if self
1580+
.lookbehinds
1581+
.iter()
1582+
.any(|i| !i.preorder(&|e| e.start_id() != sid))
1583+
{
15401584
'<'
15411585
} else {
15421586
' '

0 commit comments

Comments
 (0)