Skip to content

Commit eb2eff9

Browse files
committed
syntax: flatten concatenations
This makes the Hir::concat constructor a bit smarter by combining adjacent literals and flattening child concatenations into the parent concatenation.
1 parent 2a372a3 commit eb2eff9

File tree

2 files changed

+97
-6
lines changed

2 files changed

+97
-6
lines changed

regex-syntax/src/hir/mod.rs

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,17 @@ impl Hir {
221221
pub fn properties(&self) -> &Properties {
222222
&self.props
223223
}
224+
225+
/// Splits this HIR into its constituent parts.
226+
///
227+
/// This is useful because `let Hir { kind, props } = hir;` does not work
228+
/// because of `Hir`'s custom `Drop` implementation.
229+
fn into_parts(mut self) -> (HirKind, Properties) {
230+
(
231+
core::mem::replace(&mut self.kind, HirKind::Empty),
232+
core::mem::replace(&mut self.props, Properties::empty()),
233+
)
234+
}
224235
}
225236

226237
/// Smart constructors for HIR values.
@@ -324,14 +335,67 @@ impl Hir {
324335
/// Returns the concatenation of the given expressions.
325336
///
326337
/// This flattens the concatenation as appropriate.
327-
pub fn concat(mut exprs: Vec<Hir>) -> Hir {
328-
if exprs.is_empty() {
338+
pub fn concat(hirs: Vec<Hir>) -> Hir {
339+
// We rebuild the concatenation by simplifying it. Would be nice to do
340+
// it in place, but that seems a little tricky?
341+
let mut new = vec![];
342+
// This gobbles up any adjacent literals in a concatenation and smushes
343+
// them together. Basically, when we see a literal, we add its bytes
344+
// to 'prior_lit', and whenever we see anything else, we first take
345+
// any bytes in 'prior_lit' and add it to the 'new' concatenation.
346+
let mut prior_lit: Option<Vec<u8>> = None;
347+
for hir in hirs {
348+
let (kind, props) = hir.into_parts();
349+
match kind {
350+
HirKind::Literal(Literal(bytes)) => {
351+
if let Some(ref mut prior_bytes) = prior_lit {
352+
prior_bytes.extend_from_slice(&bytes);
353+
} else {
354+
prior_lit = Some(bytes.to_vec());
355+
}
356+
}
357+
// We also flatten concats that are direct children of another
358+
// concat. We only need to do this one level deep since
359+
// Hir::concat is the only way to build concatenations, and so
360+
// flattening happens inductively.
361+
HirKind::Concat(hirs2) => {
362+
for hir2 in hirs2 {
363+
let (kind2, props2) = hir2.into_parts();
364+
match kind2 {
365+
HirKind::Literal(Literal(bytes)) => {
366+
if let Some(ref mut prior_bytes) = prior_lit {
367+
prior_bytes.extend_from_slice(&bytes);
368+
} else {
369+
prior_lit = Some(bytes.to_vec());
370+
}
371+
}
372+
kind2 => {
373+
if let Some(prior_bytes) = prior_lit.take() {
374+
new.push(Hir::literal(prior_bytes));
375+
}
376+
new.push(Hir { kind: kind2, props: props2 });
377+
}
378+
}
379+
}
380+
}
381+
kind => {
382+
if let Some(prior_bytes) = prior_lit.take() {
383+
new.push(Hir::literal(prior_bytes));
384+
}
385+
new.push(Hir { kind, props });
386+
}
387+
}
388+
}
389+
if let Some(prior_bytes) = prior_lit.take() {
390+
new.push(Hir::literal(prior_bytes));
391+
}
392+
if new.is_empty() {
329393
return Hir::empty();
330-
} else if exprs.len() == 1 {
331-
return exprs.pop().unwrap();
394+
} else if new.len() == 1 {
395+
return new.pop().unwrap();
332396
}
333-
let props = Properties::concat(&exprs);
334-
Hir { kind: HirKind::Concat(exprs), props }
397+
let props = Properties::concat(&new);
398+
Hir { kind: HirKind::Concat(new), props }
335399
}
336400

337401
/// Returns the alternation of the given expressions.

regex-syntax/src/hir/translate.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3308,4 +3308,31 @@ mod tests {
33083308
assert!(!props(r"(a)|b").is_alternation_literal());
33093309
assert!(!props(r"a|(b)").is_alternation_literal());
33103310
}
3311+
3312+
// Checks that the smart Hir::concat constructor simplifies translated
// patterns as expected: empty groups collapse to the empty HIR, adjacent
// literals are merged, and nested non-capturing groups are flattened into
// the enclosing concatenation.
#[test]
fn smart_concat() {
    // Patterns with nothing in them collapse to the empty HIR.
    for pattern in ["", "(?:)"] {
        assert_eq!(Hir::empty(), t(pattern));
    }

    // Pure literal patterns merge into one literal, with or without groups.
    assert_eq!(hir_lit("abc"), t("abc"));
    assert_eq!(hir_lit("foobar"), t("(?:foo)(?:bar)"));
    assert_eq!(hir_lit("quuxfoobarbaz"), t("quux(?:foo)(?:bar)baz"));

    // A look-around assertion splits the literal into two runs, whether
    // the group around it is nested once or twice.
    let split_at_start = || {
        hir_cat(vec![
            hir_lit("foobar"),
            hir_look(hir::Look::Start),
            hir_lit("bazquux"),
        ])
    };
    assert_eq!(split_at_start(), t("foo(?:bar^baz)quux"));
    assert_eq!(split_at_start(), t("foo(?:ba(?:r^b)az)quux"));
}
33113338
}

0 commit comments

Comments
 (0)