Skip to content

Commit 4c452a2

Browse files
committed
Fix conservative computation of look-behind start offsets
1 parent 8589025 commit 4c452a2

File tree

4 files changed

+65
-27
lines changed

4 files changed

+65
-27
lines changed

regex-automata/src/nfa/thompson/builder.rs

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -342,9 +342,10 @@ pub struct Builder {
342342
start_pattern: Vec<StateID>,
343343
/// The starting states for each individual look-behind sub-expression.
344344
start_look_behind: Vec<StateID>,
345-
/// The length (in bytes) of the longest string matched by any
346-
/// look-behind sub-expression. If `None`, the length is unbounded.
347-
maximum_look_behind_len: Option<usize>,
345+
/// Among all look-behinds, this is the furthest offset (in bytes) from
346+
/// the beginning of the main regex that a look-behind starts at.
347+
/// If `None`, the offset is unbounded.
348+
maximum_lookbehind_offset_from_start: Option<usize>,
348349
/// A map from pattern ID to capture group index to name. (If no name
349350
/// exists, then a None entry is present. Thus, all capturing groups are
350351
/// present in this mapping.)
@@ -377,7 +378,10 @@ pub struct Builder {
377378
impl Builder {
378379
/// Create a new builder for hand-assembling NFAs.
379380
pub fn new() -> Builder {
380-
Builder { maximum_look_behind_len: Some(0), ..Builder::default() }
381+
Builder {
382+
maximum_lookbehind_offset_from_start: Some(0),
383+
..Builder::default()
384+
}
381385
}
382386

383387
/// Clear this builder.
@@ -456,7 +460,9 @@ impl Builder {
456460

457461
nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
458462
nfa.set_look_behind_starts(self.start_look_behind.as_slice());
459-
nfa.set_maximum_look_behind_len(self.maximum_look_behind_len);
463+
nfa.set_maximum_lookbehind_offset_from_start(
464+
self.maximum_lookbehind_offset_from_start,
465+
);
460466
nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
461467
// The idea here is to convert our intermediate states to their final
462468
// form. The only real complexity here is the process of converting
@@ -715,21 +721,24 @@ impl Builder {
715721
}
716722

717723
/// Adds the `start_id` to the set of starting states that is used when
718-
/// running look-behind expressions. Additionally registers the maximum
719-
/// length (in bytes) that the sub-expression of the look-behind can match.
724+
/// running look-behind expressions. Additionally registers the furthest
725+
/// offset (in bytes) from the start of the main regex at which this
726+
/// look-behind starts.
720727
pub fn start_look_behind(
721728
&mut self,
722729
start_id: StateID,
723-
maximum_len: Option<usize>,
730+
offset_from_start: Option<usize>,
724731
) {
725732
self.start_look_behind.push(start_id);
726733

727-
self.maximum_look_behind_len =
728-
match (self.maximum_look_behind_len, maximum_len) {
729-
(Some(l1), Some(l2)) => Some(usize::max(l1, l2)),
730-
// A None subsumes the entire result.
731-
(None, _) | (_, None) => None,
732-
};
734+
self.maximum_lookbehind_offset_from_start = match (
735+
self.maximum_lookbehind_offset_from_start,
736+
offset_from_start,
737+
) {
738+
(Some(l1), Some(l2)) => Some(usize::max(l1, l2)),
739+
// A None subsumes the entire result.
740+
(None, _) | (_, None) => None,
741+
};
733742
}
734743

735744
/// Add an "empty" NFA state.

regex-automata/src/nfa/thompson/compiler.rs

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -713,6 +713,11 @@ pub struct Compiler {
713713
utf8_suffix: RefCell<Utf8SuffixMap>,
714714
/// The next index to use for a look-around expression.
715715
lookaround_index: RefCell<SmallIndex>,
716+
/// How far (in bytes) from the beginning of the main regex the current
717+
/// look-behind starts. This is updated when relativizing to
718+
/// the current look-behind expression. When `None`, the distance can be
719+
/// seen as infinity.
720+
current_lookbehind_offset_from_start: RefCell<Option<usize>>,
716721
}
717722

718723
impl Compiler {
@@ -726,6 +731,7 @@ impl Compiler {
726731
trie_state: RefCell::new(RangeTrie::new()),
727732
utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
728733
lookaround_index: RefCell::new(SmallIndex::ZERO),
734+
current_lookbehind_offset_from_start: RefCell::new(Some(0)),
729735
}
730736
}
731737

@@ -1021,10 +1027,13 @@ impl Compiler {
10211027
}
10221028
}
10231029

1030+
/// Compile a look-around expression as its own sub-automaton. Its starting
1031+
/// state is saved.
10241032
fn c_lookaround(
10251033
&self,
10261034
lookaround: &LookAround,
10271035
) -> Result<ThompsonRef, BuildError> {
1036+
// Assign a unique index for this look-around.
10281037
let idx = *self.lookaround_index.borrow();
10291038
*self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more())
10301039
.map_err(|e| {
@@ -1036,14 +1045,31 @@ impl Compiler {
10361045
};
10371046
let check = self.add_check_lookaround(idx, pos)?;
10381047

1048+
// Compute the furthest offset from the start of the main regex
1049+
// at which this look-around can begin. We offset the current start
1050+
// offset by the maximal match length of the subexpression.
1051+
let maximum_len = lookaround.sub().properties().maximum_len();
1052+
let relative_start =
1053+
*self.current_lookbehind_offset_from_start.borrow();
1054+
let start_offset = match (relative_start, maximum_len) {
1055+
(Some(s), Some(l)) => Some(s + l),
1056+
(None, _) | (_, None) => None,
1057+
};
1058+
10391059
let unanchored =
10401060
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
1041-
let maximum_len = lookaround.sub().properties().maximum_len();
10421061
self.builder
10431062
.borrow_mut()
1044-
.start_look_behind(unanchored.start, maximum_len);
1063+
.start_look_behind(unanchored.start, start_offset);
10451064

1065+
// When compiling the subexpression we temporarily change the starting
1066+
// offset and restore it after. This way, the subexpression is relativized
1067+
// to our current offset.
1068+
*self.current_lookbehind_offset_from_start.borrow_mut() = start_offset;
10461069
let sub = self.c(lookaround.sub())?;
1070+
*self.current_lookbehind_offset_from_start.borrow_mut() =
1071+
relative_start;
1072+
10471073
let write = self.add_write_lookaround(idx)?;
10481074
self.patch(unanchored.end, sub.start)?;
10491075
self.patch(sub.end, write)?;

regex-automata/src/nfa/thompson/nfa.rs

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1115,8 +1115,8 @@ impl NFA {
11151115
/// Returns the furthest offset (in bytes) from the beginning of the main
11161116
/// regex that any look-behind starts at. If `None`, it is unbounded.
11171117
#[inline]
1118-
pub fn maximum_look_behind_len(&self) -> Option<usize> {
1119-
self.0.maximum_look_behind_len
1118+
pub fn maximum_lookbehind_offset_from_start(&self) -> Option<usize> {
1119+
self.0.maximum_lookbehind_offset_from_start
11201120
}
11211121

11221122
// FIXME: The `look_set_prefix_all` computation was not correct, and it
@@ -1285,9 +1285,10 @@ pub(super) struct Inner {
12851285
lookaround_count: usize,
12861286
/// Contains the start states for each of the look-behind subexpressions.
12871287
start_look_behind: Vec<StateID>,
1288-
/// The length (in bytes) of the longest string matched by any
1289-
/// look-behind sub-expression. If `None`, the length is unbounded.
1290-
maximum_look_behind_len: Option<usize>,
1288+
/// Among all look-behinds, this is the furthest offset (in bytes) from
1289+
/// the beginning of the main regex that a look-behind starts at.
1290+
/// If `None`, the offset is unbounded.
1291+
maximum_lookbehind_offset_from_start: Option<usize>,
12911292
/// Heap memory used indirectly by NFA states and other things (like the
12921293
/// various capturing group representations above). Since each state
12931294
/// might use a different amount of heap, we need to keep track of this
@@ -1444,11 +1445,12 @@ impl Inner {
14441445
self.start_look_behind = look_behind_starts.to_vec();
14451446
}
14461447

1447-
pub(super) fn set_maximum_look_behind_len(
1448+
pub(super) fn set_maximum_lookbehind_offset_from_start(
14481449
&mut self,
1449-
maximum_look_behind_len: Option<usize>,
1450+
maximum_lookbehind_offset_from_start: Option<usize>,
14501451
) {
1451-
self.maximum_look_behind_len = maximum_look_behind_len;
1452+
self.maximum_lookbehind_offset_from_start =
1453+
maximum_lookbehind_offset_from_start;
14521454
}
14531455

14541456
/// Sets the UTF-8 mode of this NFA.

regex-automata/src/nfa/thompson/pikevm.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1258,7 +1258,8 @@ impl PikeVM {
12581258
Some(config) => config,
12591259
};
12601260

1261-
let maximum_look_behind_len = self.nfa.maximum_look_behind_len();
1261+
let maximum_lookbehind_offset_from_start =
1262+
self.nfa.maximum_lookbehind_offset_from_start();
12621263

12631264
let pre =
12641265
if anchored { None } else { self.get_config().get_prefilter() };
@@ -1280,7 +1281,7 @@ impl PikeVM {
12801281
// start from 0.
12811282
let start_position = usize::saturating_sub(
12821283
input.start(),
1283-
maximum_look_behind_len.unwrap_or(input.start()),
1284+
maximum_lookbehind_offset_from_start.unwrap_or(input.start()),
12841285
);
12851286

12861287
// This initializes the look-behind threads from the `start_position`
@@ -1362,7 +1363,7 @@ impl PikeVM {
13621363
at,
13631364
usize::saturating_sub(
13641365
span.start,
1365-
maximum_look_behind_len
1366+
maximum_lookbehind_offset_from_start
13661367
.unwrap_or(span.start),
13671368
),
13681369
);

0 commit comments

Comments
 (0)