Skip to content

Commit 7eb9594

Browse files
committed
Keep track of maximum look-behind length
1 parent b97fb5a commit 7eb9594

File tree

3 files changed

+40
-4
lines changed

3 files changed

+40
-4
lines changed

regex-automata/src/nfa/thompson/builder.rs

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,9 @@ pub struct Builder {
342342
start_pattern: Vec<StateID>,
343343
/// The starting states for each individual look-behind sub-expression.
344344
start_look_behind: Vec<StateID>,
345+
/// The length (in bytes) of the longest string matched by any
346+
/// look-behind sub-expression. If `None`, the length is unbounded.
347+
maximum_look_behind_len: Option<usize>,
345348
/// A map from pattern ID to capture group index to name. (If no name
346349
/// exists, then a None entry is present. Thus, all capturing groups are
347350
/// present in this mapping.)
@@ -374,7 +377,7 @@ pub struct Builder {
374377
impl Builder {
375378
/// Create a new builder for hand-assembling NFAs.
376379
pub fn new() -> Builder {
377-
Builder::default()
380+
Builder { maximum_look_behind_len: Some(0), ..Builder::default() }
378381
}
379382

380383
/// Clear this builder.
@@ -453,6 +456,7 @@ impl Builder {
453456

454457
nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
455458
nfa.set_look_behind_starts(self.start_look_behind.as_slice());
459+
nfa.set_maximum_look_behind_len(self.maximum_look_behind_len);
456460
nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
457461
// The idea here is to convert our intermediate states to their final
458462
// form. The only real complexity here is the process of converting
@@ -711,9 +715,21 @@ impl Builder {
711715
}
712716

713717
/// Adds the `start_id` to the set of starting states that is used when
714-
/// running look-behind expressions.
715-
pub fn start_look_behind(&mut self, start_id: StateID) {
718+
/// running look-behind expressions. Additionally registers the maximum
719+
/// length (in bytes) the sub-expression can match; `None` means unbounded.
720+
pub fn start_look_behind(
721+
&mut self,
722+
start_id: StateID,
723+
maximum_len: Option<usize>,
724+
) {
716725
self.start_look_behind.push(start_id);
726+
727+
self.maximum_look_behind_len =
728+
match (self.maximum_look_behind_len, maximum_len) {
729+
(Some(l1), Some(l2)) => Some(usize::max(l1, l2)),
730+
// An unbounded (`None`) length makes the maximum unbounded.
731+
(None, _) | (_, None) => None,
732+
};
717733
}
718734

719735
/// Add an "empty" NFA state.

regex-automata/src/nfa/thompson/compiler.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1039,7 +1039,10 @@ impl Compiler {
10391039

10401040
let unanchored =
10411041
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
1042-
self.builder.borrow_mut().start_look_behind(unanchored.start);
1042+
let maximum_len = lookaround.sub().properties().maximum_len();
1043+
self.builder
1044+
.borrow_mut()
1045+
.start_look_behind(unanchored.start, maximum_len);
10431046

10441047
let sub = self.c(lookaround.sub())?;
10451048
let write = self.add_write_lookaround(idx)?;

regex-automata/src/nfa/thompson/nfa.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1112,6 +1112,13 @@ impl NFA {
11121112
&self.0.start_look_behind
11131113
}
11141114

1115+
/// Returns the length (in bytes) of the longest string matched by any
1116+
/// look-behind sub-expression. If `None`, the length is unbounded.
1117+
#[inline]
1118+
pub fn maximum_look_behind_len(&self) -> Option<usize> {
1119+
self.0.maximum_look_behind_len
1120+
}
1121+
11151122
// FIXME: The `look_set_prefix_all` computation was not correct, and it
11161123
// seemed a little tricky to fix it. Since I wasn't actually using it for
11171124
// anything, I just decided to remove it in the run up to the regex 1.9
@@ -1279,6 +1286,9 @@ pub(super) struct Inner {
12791286
lookaround_count: usize,
12801287
/// Contains the start state for each of the look-behind subexpressions.
12811288
start_look_behind: Vec<StateID>,
1289+
/// The length (in bytes) of the longest string matched by any
1290+
/// look-behind sub-expression. If `None`, the length is unbounded.
1291+
maximum_look_behind_len: Option<usize>,
12821292
/// Heap memory used indirectly by NFA states and other things (like the
12831293
/// various capturing group representations above). Since each state
12841294
/// might use a different amount of heap, we need to keep track of this
@@ -1435,6 +1445,13 @@ impl Inner {
14351445
self.start_look_behind = look_behind_starts.to_vec();
14361446
}
14371447

1448+
pub(super) fn set_maximum_look_behind_len(
1449+
&mut self,
1450+
maximum_look_behind_len: Option<usize>,
1451+
) {
1452+
self.maximum_look_behind_len = maximum_look_behind_len;
1453+
}
1454+
14381455
/// Sets the UTF-8 mode of this NFA.
14391456
pub(super) fn set_utf8(&mut self, yes: bool) {
14401457
self.utf8 = yes;

0 commit comments

Comments
 (0)