Skip to content

Commit 9e278fe

Browse files
committed
Generalize the NFA compiler info and specialize the PikeVM
1 parent 4c452a2 commit 9e278fe

File tree

4 files changed

+97
-76
lines changed

4 files changed

+97
-76
lines changed

regex-automata/src/nfa/thompson/builder.rs

Lines changed: 15 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use alloc::{sync::Arc, vec, vec::Vec};
55
use crate::{
66
nfa::thompson::{
77
error::BuildError,
8-
nfa::{self, SparseTransitions, Transition, NFA},
8+
nfa::{self, LookBehindInfo, SparseTransitions, Transition, NFA},
99
},
1010
util::{
1111
look::{Look, LookMatcher},
@@ -340,12 +340,11 @@ pub struct Builder {
340340
/// contains a single regex, then `start_pattern[0]` and `start_anchored`
341341
/// are always equivalent.
342342
start_pattern: Vec<StateID>,
343-
/// The starting states for each individual look-behind sub-expression.
344-
start_look_behind: Vec<StateID>,
345-
/// Among all look-behinds, this is the furthest offset (in bytes) from
346-
/// the beginning of the main regex that a look-behind starts at.
347-
/// If `None`, the offset is unbounded.
348-
maximum_lookbehind_offset_from_start: Option<usize>,
343+
/// A vector of metadata about each look-behind in this NFA.
344+
///
345+
/// Must be stored in a depth-first pre-order with regards to the nesting
346+
/// of look-behinds.
347+
lookbehinds: Vec<LookBehindInfo>,
349348
/// A map from pattern ID to capture group index to name. (If no name
350349
/// exists, then a None entry is present. Thus, all capturing groups are
351350
/// present in this mapping.)
@@ -378,10 +377,7 @@ pub struct Builder {
378377
impl Builder {
379378
/// Create a new builder for hand-assembling NFAs.
380379
pub fn new() -> Builder {
381-
Builder {
382-
maximum_lookbehind_offset_from_start: Some(0),
383-
..Builder::default()
384-
}
380+
Builder::default()
385381
}
386382

387383
/// Clear this builder.
@@ -394,7 +390,7 @@ impl Builder {
394390
self.pattern_id = None;
395391
self.states.clear();
396392
self.start_pattern.clear();
397-
self.start_look_behind.clear();
393+
self.lookbehinds.clear();
398394
self.captures.clear();
399395
self.memory_states = 0;
400396
}
@@ -459,10 +455,7 @@ impl Builder {
459455
remap.resize(self.states.len(), StateID::ZERO);
460456

461457
nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
462-
nfa.set_look_behind_starts(self.start_look_behind.as_slice());
463-
nfa.set_maximum_lookbehind_offset_from_start(
464-
self.maximum_lookbehind_offset_from_start,
465-
);
458+
nfa.set_lookbehinds(self.lookbehinds.as_slice());
466459
nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
467460
// The idea here is to convert our intermediate states to their final
468461
// form. The only real complexity here is the process of converting
@@ -724,21 +717,16 @@ impl Builder {
724717
/// running look-behind expressions. Additionally registers the furthest
725718
/// offset (in bytes) from the start of the main regex this look-behind
726719
/// starts at.
727-
pub fn start_look_behind(
720+
///
721+
/// Look-behinds must be started in a depth-first pre-order fashion with
722+
/// regards to the nesting of look-behinds.
723+
pub fn start_lookbehind(
728724
&mut self,
729725
start_id: StateID,
730726
offset_from_start: Option<usize>,
731727
) {
732-
self.start_look_behind.push(start_id);
733-
734-
self.maximum_lookbehind_offset_from_start = match (
735-
self.maximum_lookbehind_offset_from_start,
736-
offset_from_start,
737-
) {
738-
(Some(l1), Some(l2)) => Some(usize::max(l1, l2)),
739-
// A None subsumes the entire result.
740-
(None, _) | (_, None) => None,
741-
};
728+
self.lookbehinds
729+
.push(LookBehindInfo::new(start_id, offset_from_start));
742730
}
743731

744732
/// Add an "empty" NFA state.

regex-automata/src/nfa/thompson/compiler.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1060,7 +1060,7 @@ impl Compiler {
10601060
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
10611061
self.builder
10621062
.borrow_mut()
1063-
.start_look_behind(unanchored.start, start_offset);
1063+
.start_lookbehind(unanchored.start, start_offset);
10641064

10651065
// When compiling the subexpression we temporarily change the starting
10661066
// offset and restore it after. This way, the subexpression is relativized

regex-automata/src/nfa/thompson/nfa.rs

Lines changed: 46 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,15 +1108,8 @@ impl NFA {
11081108

11091109
/// Returns the look-behind information (including the starting states) used for initializing look-behind evaluation.
11101110
#[inline]
1111-
pub fn look_behind_starts(&self) -> &Vec<StateID> {
1112-
&self.0.start_look_behind
1113-
}
1114-
1115-
/// Returns the length (in bytes) of the longest string matched by any
1116-
/// look-behind sub-expression. If `None`, the length is unbounded.
1117-
#[inline]
1118-
pub fn maximum_lookbehind_offset_from_start(&self) -> Option<usize> {
1119-
self.0.maximum_lookbehind_offset_from_start
1111+
pub fn lookbehinds(&self) -> &Vec<LookBehindInfo> {
1112+
&self.0.lookbehinds
11201113
}
11211114

11221115
// FIXME: The `look_set_prefix_all` computation was not correct, and it
@@ -1283,19 +1276,48 @@ pub(super) struct Inner {
12831276
/// This is needed to initialize the table for storing the result of
12841277
/// look-around evaluation.
12851278
lookaround_count: usize,
1286-
/// Contains the start states for each of the look-behind subexpressions.
1287-
start_look_behind: Vec<StateID>,
1288-
/// Among all look-behinds, this is the furthest offset (in bytes) from
1289-
/// the beginning of the main regex that a look-behind starts at.
1290-
/// If `None`, the offset is unbounded.
1291-
maximum_lookbehind_offset_from_start: Option<usize>,
1279+
/// A vector of metadata about each look-behind in this NFA.
1280+
///
1281+
/// Must be stored in a depth-first pre-order with regards to the nesting
1282+
/// of look-behinds.
1283+
lookbehinds: Vec<LookBehindInfo>,
12921284
/// Heap memory used indirectly by NFA states and other things (like the
12931285
/// various capturing group representations above). Since each state
12941286
/// might use a different amount of heap, we need to keep track of this
12951287
/// incrementally.
12961288
memory_extra: usize,
12971289
}
12981290

1291+
/// Information about a look-behind needed for execution.
1292+
#[derive(Clone, Copy, Debug)]
1293+
pub struct LookBehindInfo {
1294+
/// The id of the start state of the look-behind subexpression.
1295+
start_id: StateID,
1296+
/// The offset (in bytes) from the beginning of the main regex that a
1297+
/// look-behind starts at. If `None`, the offset is unbounded.
1298+
offset_from_start: Option<usize>,
1299+
}
1300+
1301+
impl LookBehindInfo {
1302+
pub(super) fn new(
1303+
start_id: StateID,
1304+
offset_from_start: Option<usize>,
1305+
) -> Self {
1306+
Self { start_id, offset_from_start }
1307+
}
1308+
1309+
/// The start state of the look-behind subexpression.
1310+
pub(super) fn start_state(&self) -> StateID {
1311+
self.start_id
1312+
}
1313+
1314+
/// The offset (in bytes) from the beginning of the main regex that a
1315+
/// look-behind starts at. If `None`, the offset is unbounded.
1316+
pub(super) fn offset_from_start(&self) -> Option<usize> {
1317+
self.offset_from_start
1318+
}
1319+
}
1320+
12991321
impl Inner {
13001322
/// Runs any last finalization bits and turns this into a full NFA.
13011323
pub(super) fn into_nfa(mut self) -> NFA {
@@ -1438,19 +1460,12 @@ impl Inner {
14381460
self.start_pattern = start_pattern.to_vec();
14391461
}
14401462

1441-
pub(super) fn set_look_behind_starts(
1442-
&mut self,
1443-
look_behind_starts: &[StateID],
1444-
) {
1445-
self.start_look_behind = look_behind_starts.to_vec();
1446-
}
1447-
1448-
pub(super) fn set_maximum_lookbehind_offset_from_start(
1449-
&mut self,
1450-
maximum_lookbehind_offset_from_start: Option<usize>,
1451-
) {
1452-
self.maximum_lookbehind_offset_from_start =
1453-
maximum_lookbehind_offset_from_start;
1463+
/// Sets the look-behind information of this NFA.
1464+
///
1465+
/// The slice must be in a depth-first pre-order with regards to the
1466+
/// nesting of look-behinds.
1467+
pub(super) fn set_lookbehinds(&mut self, lookbehinds: &[LookBehindInfo]) {
1468+
self.lookbehinds = lookbehinds.to_vec();
14541469
}
14551470

14561471
/// Sets the UTF-8 mode of this NFA.
@@ -1506,7 +1521,8 @@ impl Inner {
15061521
for id in self.start_pattern.iter_mut() {
15071522
*id = old_to_new[*id];
15081523
}
1509-
for id in self.start_look_behind.iter_mut() {
1524+
for LookBehindInfo { start_id: id, .. } in self.lookbehinds.iter_mut()
1525+
{
15101526
*id = old_to_new[*id];
15111527
}
15121528
}
@@ -1520,7 +1536,7 @@ impl fmt::Debug for Inner {
15201536
'^'
15211537
} else if sid == self.start_unanchored {
15221538
'>'
1523-
} else if self.start_look_behind.contains(&sid) {
1539+
} else if self.lookbehinds.iter().any(|i| i.start_state() == sid) {
15241540
'<'
15251541
} else {
15261542
' '

regex-automata/src/nfa/thompson/pikevm.rs

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,24 @@ impl Builder {
278278
/// given here is already built.
279279
pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> {
280280
nfa.look_set_any().available().map_err(BuildError::word)?;
281-
Ok(PikeVM { config: self.config.clone(), nfa })
281+
282+
// The reverse of a depth-first pre-order is the depth-first
283+
// reverse-post-order. This means that a look-around is always before its
284+
// surrounding look-behinds in this vector.
285+
let lookbehind_starts =
286+
nfa.lookbehinds().iter().map(|i| i.start_state()).rev().collect();
287+
288+
let maximum_lookbehind_offset_from_start =
289+
nfa.lookbehinds().iter().try_fold(0, |acc, curr| {
290+
curr.offset_from_start().map(|o| usize::max(acc, o))
291+
});
292+
293+
Ok(PikeVM {
294+
config: self.config.clone(),
295+
nfa,
296+
lookbehind_starts,
297+
maximum_lookbehind_offset_from_start,
298+
})
282299
}
283300

284301
/// Apply the given `PikeVM` configuration options to this builder.
@@ -387,6 +404,13 @@ impl Builder {
387404
pub struct PikeVM {
388405
config: Config,
389406
nfa: NFA,
407+
/// The look-behind start states, stored in depth-first reverse-post-order with regard to the nesting
408+
/// of look-behinds.
409+
lookbehind_starts: Vec<StateID>,
410+
/// Among all look-behinds, this is the furthest offset (in bytes) from
411+
/// the beginning of the main regex that a look-behind starts at.
412+
/// If `None`, the offset is unbounded.
413+
maximum_lookbehind_offset_from_start: Option<usize>,
390414
}
391415

392416
impl PikeVM {
@@ -1258,9 +1282,6 @@ impl PikeVM {
12581282
Some(config) => config,
12591283
};
12601284

1261-
let maximum_lookbehind_offset_from_start =
1262-
self.nfa.maximum_lookbehind_offset_from_start();
1263-
12641285
let pre =
12651286
if anchored { None } else { self.get_config().get_prefilter() };
12661287
let Cache {
@@ -1281,18 +1302,17 @@ impl PikeVM {
12811302
// start from 0.
12821303
let start_position = usize::saturating_sub(
12831304
input.start(),
1284-
maximum_lookbehind_offset_from_start.unwrap_or(input.start()),
1305+
self.maximum_lookbehind_offset_from_start
1306+
.unwrap_or(input.start()),
12851307
);
12861308

12871309
// This initializes the look-behind threads from the `start_position`.
12881310
// Note: since capture groups are not allowed inside look-behinds,
12891311
// there won't be any Capture epsilon transitions and hence it is ok to
1290-
// use &mut [] for the slots parameter. We need to add the start states
1291-
// in reverse because more deeply nested look-behinds have a higher index
1292-
// but must be executed first, so that the result is available for the
1293-
// outer expression.
1294-
for look_behind_start in self.nfa.look_behind_starts().iter().rev()
1295-
{
1312+
// use &mut [] for the slots parameter. Since the start states are stored
1313+
// in depth-first reverse-post-order, more deeply nested look-behinds are
1314+
// executed first, so that the result is available for the outer expression.
1315+
for look_behind_start in &self.lookbehind_starts {
12961316
self.epsilon_closure(
12971317
stack,
12981318
&mut [],
@@ -1363,19 +1383,16 @@ impl PikeVM {
13631383
at,
13641384
usize::saturating_sub(
13651385
span.start,
1366-
maximum_lookbehind_offset_from_start
1386+
self.maximum_lookbehind_offset_from_start
13671387
.unwrap_or(span.start),
13681388
),
13691389
);
13701390
// If we resume from later than `at`, we need
13711391
// to reinitialize the look-behind threads.
13721392
if start_position != at {
13731393
curr_lookaround.set.clear();
1374-
for look_behind_start in self
1375-
.nfa
1376-
.look_behind_starts()
1377-
.iter()
1378-
.rev()
1394+
for look_behind_start in
1395+
&self.lookbehind_starts
13791396
{
13801397
self.epsilon_closure(
13811398
stack,
@@ -1598,7 +1615,7 @@ impl PikeVM {
15981615
match_lookaround: _,
15991616
} = cache;
16001617

1601-
for look_behind_start in self.nfa.look_behind_starts().iter().rev() {
1618+
for look_behind_start in &self.lookbehind_starts {
16021619
self.epsilon_closure(
16031620
stack,
16041621
&mut [],

0 commit comments

Comments
 (0)