Skip to content

Commit 5d08243

Browse files
committed
syntax: remove non-capturing groups from HIR
It turns out they are completely superfluous in the HIR, so we can drop them completely. We only need to explicitly represent capturing groups.
1 parent 55976dc commit 5d08243

File tree

4 files changed

+63
-102
lines changed

4 files changed

+63
-102
lines changed

regex-syntax/src/hir/mod.rs

Lines changed: 13 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,39 +1118,25 @@ impl Look {
11181118
}
11191119
}
11201120

1121-
/// The high-level intermediate representation for a group.
1121+
/// The high-level intermediate representation for a capturing group.
11221122
///
1123-
/// This represents one of three possible group types:
1123+
/// A capturing group always has an index and a child expression. It may
1124+
/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1125+
/// necessary.
11241126
///
1125-
/// 1. A non-capturing group (e.g., `(?:expr)`).
1126-
/// 2. A capturing group (e.g., `(expr)`).
1127-
/// 3. A named capturing group (e.g., `(?P<name>expr)`).
1127+
/// Note that there is no explicit representation of a non-capturing group
1128+
/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1129+
/// the recursive structure of the `Hir` itself.
11281130
#[derive(Clone, Debug, Eq, PartialEq)]
11291131
pub struct Group {
1130-
/// The kind of this group. If it is a capturing group, then the kind
1131-
/// contains the capture group index (and the name, if it is a named
1132-
/// group).
1133-
pub kind: GroupKind,
1132+
/// The capture index of the group.
1133+
pub index: u32,
1134+
/// The name of the group, if it exists.
1135+
pub name: Option<Box<str>>,
11341136
/// The expression inside the capturing group, which may be empty.
11351137
pub hir: Box<Hir>,
11361138
}
11371139

1138-
/// The kind of group.
1139-
#[derive(Clone, Debug, Eq, PartialEq)]
1140-
pub enum GroupKind {
1141-
/// A non-capturing group.
1142-
NonCapturing,
1143-
/// A capturing group with an optional name.
1144-
///
1145-
/// The value is the capture index of the group.
1146-
Capture {
1147-
/// The capture index of the group.
1148-
index: u32,
1149-
/// The name of the group, if it exists.
1150-
name: Option<String>,
1151-
},
1152-
}
1153-
11541140
/// The high-level intermediate representation of a repetition operator.
11551141
///
11561142
/// A repetition operator permits the repetition of an arbitrary
@@ -2452,7 +2438,8 @@ mod tests {
24522438
let mut expr = Hir::empty();
24532439
for _ in 0..100 {
24542440
expr = Hir::group(Group {
2455-
kind: GroupKind::NonCapturing,
2441+
index: 1,
2442+
name: None,
24562443
hir: Box::new(expr),
24572444
});
24582445
expr = Hir::repetition(Repetition {

regex-syntax/src/hir/print.rs

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -176,17 +176,12 @@ impl<W: fmt::Write> Visitor for Writer<W> {
176176
self.wtr.write_str(r"\B")?;
177177
}
178178
},
179-
HirKind::Group(ref x) => match x.kind {
180-
hir::GroupKind::Capture { ref name, .. } => {
181-
self.wtr.write_str("(")?;
182-
if let Some(ref name) = *name {
183-
write!(self.wtr, "?P<{}>", name)?;
184-
}
185-
}
186-
hir::GroupKind::NonCapturing => {
187-
self.wtr.write_str("(?:")?;
179+
HirKind::Group(hir::Group { ref name, .. }) => {
180+
self.wtr.write_str("(")?;
181+
if let Some(ref name) = *name {
182+
write!(self.wtr, "?P<{}>", name)?;
188183
}
189-
},
184+
}
190185
// Why do this? Wrapping concats and alts in non-capturing groups
191186
// is not *always* necessary, but is sometimes necessary. For
192187
// example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)'
@@ -415,11 +410,11 @@ mod tests {
415410
fn print_group() {
416411
roundtrip("()", "()");
417412
roundtrip("(?P<foo>)", "(?P<foo>)");
418-
roundtrip("(?:)", "(?:)");
413+
roundtrip("(?:)", "");
419414

420415
roundtrip("(a)", "(a)");
421416
roundtrip("(?P<foo>a)", "(?P<foo>a)");
422-
roundtrip("(?:a)", "(?:a)");
417+
roundtrip("(?:a)", "a");
423418

424419
roundtrip("((((a))))", "((((a))))");
425420
}

regex-syntax/src/hir/translate.rs

Lines changed: 32 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -915,17 +915,16 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
915915
}
916916

917917
fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
918-
let kind = match group.kind {
919-
ast::GroupKind::CaptureIndex(index) => {
920-
hir::GroupKind::Capture { index, name: None }
921-
}
922-
ast::GroupKind::CaptureName(ref cap) => hir::GroupKind::Capture {
923-
index: cap.index,
924-
name: Some(cap.name.clone()),
925-
},
926-
ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing,
918+
let (index, name) = match group.kind {
919+
ast::GroupKind::CaptureIndex(index) => (index, None),
920+
ast::GroupKind::CaptureName(ref cap) => {
921+
(cap.index, Some(cap.name.clone().into_boxed_str()))
922+
}
923+
// The HIR doesn't need to use non-capturing groups, since the way
924+
// in which the data type is defined handles this automatically.
925+
ast::GroupKind::NonCapturing(_) => return expr,
927926
};
928-
Hir::group(hir::Group { kind, hir: Box::new(expr) })
927+
Hir::group(hir::Group { index, name, hir: Box::new(expr) })
929928
}
930929

931930
fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
@@ -1349,26 +1348,14 @@ mod tests {
13491348
Hir::literal(s)
13501349
}
13511350

1352-
fn hir_group(i: u32, expr: Hir) -> Hir {
1353-
Hir::group(hir::Group {
1354-
kind: hir::GroupKind::Capture { index: i, name: None },
1355-
hir: Box::new(expr),
1356-
})
1351+
fn hir_group(index: u32, expr: Hir) -> Hir {
1352+
Hir::group(hir::Group { index, name: None, hir: Box::new(expr) })
13571353
}
13581354

1359-
fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir {
1355+
fn hir_group_name(index: u32, name: &str, expr: Hir) -> Hir {
13601356
Hir::group(hir::Group {
1361-
kind: hir::GroupKind::Capture {
1362-
index: i,
1363-
name: Some(name.to_string()),
1364-
},
1365-
hir: Box::new(expr),
1366-
})
1367-
}
1368-
1369-
fn hir_group_nocap(expr: Hir) -> Hir {
1370-
Hir::group(hir::Group {
1371-
kind: hir::GroupKind::NonCapturing,
1357+
index,
1358+
name: Some(name.into()),
13721359
hir: Box::new(expr),
13731360
})
13741361
}
@@ -1519,7 +1506,7 @@ mod tests {
15191506
assert_eq!(t(""), Hir::empty());
15201507
assert_eq!(t("(?i)"), Hir::empty());
15211508
assert_eq!(t("()"), hir_group(1, Hir::empty()));
1522-
assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
1509+
assert_eq!(t("(?:)"), Hir::empty());
15231510
assert_eq!(t("(?P<wat>)"), hir_group_name(1, "wat", Hir::empty()));
15241511
assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()]));
15251512
assert_eq!(
@@ -1592,10 +1579,7 @@ mod tests {
15921579
#[cfg(feature = "unicode-case")]
15931580
assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
15941581
#[cfg(feature = "unicode-case")]
1595-
assert_eq!(
1596-
t("(?i:a)"),
1597-
hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],))
1598-
);
1582+
assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
15991583
#[cfg(feature = "unicode-case")]
16001584
assert_eq!(
16011585
t("a(?i)a(?-i)a"),
@@ -1757,20 +1741,17 @@ mod tests {
17571741
hir_group_name(2, "bar", hir_lit("b")),
17581742
])
17591743
);
1760-
assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
1761-
assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a")));
1744+
assert_eq!(t("(?:)"), Hir::empty());
1745+
assert_eq!(t("(?:a)"), hir_lit("a"));
17621746
assert_eq!(
17631747
t("(?:a)(b)"),
1764-
hir_cat(vec![
1765-
hir_group_nocap(hir_lit("a")),
1766-
hir_group(1, hir_lit("b")),
1767-
])
1748+
hir_cat(vec![hir_lit("a"), hir_group(1, hir_lit("b")),])
17681749
);
17691750
assert_eq!(
17701751
t("(a)(?:b)(c)"),
17711752
hir_cat(vec![
17721753
hir_group(1, hir_lit("a")),
1773-
hir_group_nocap(hir_lit("b")),
1754+
hir_lit("b"),
17741755
hir_group(2, hir_lit("c")),
17751756
])
17761757
);
@@ -1793,22 +1774,21 @@ mod tests {
17931774
#[cfg(feature = "unicode-case")]
17941775
assert_eq!(
17951776
t("(?i:a)a"),
1796-
hir_cat(vec![
1797-
hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])),
1798-
hir_lit("a"),
1799-
])
1777+
hir_cat(
1778+
vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),]
1779+
)
18001780
);
18011781
assert_eq!(
18021782
t("(?i-u:a)β"),
18031783
hir_cat(vec![
1804-
hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
1784+
hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
18051785
hir_lit("β"),
18061786
])
18071787
);
18081788
assert_eq!(
18091789
t("(?:(?i-u)a)b"),
18101790
hir_cat(vec![
1811-
hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
1791+
hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
18121792
hir_lit("b"),
18131793
])
18141794
);
@@ -1822,10 +1802,9 @@ mod tests {
18221802
#[cfg(feature = "unicode-case")]
18231803
assert_eq!(
18241804
t("(?i)(?-i:a)a"),
1825-
hir_cat(vec![
1826-
hir_group_nocap(hir_lit("a")),
1827-
hir_uclass(&[('A', 'A'), ('a', 'a')]),
1828-
])
1805+
hir_cat(
1806+
vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),]
1807+
)
18291808
);
18301809
#[cfg(feature = "unicode-case")]
18311810
assert_eq!(
@@ -1858,21 +1837,21 @@ mod tests {
18581837
assert_eq!(
18591838
t("(?:a(?i)a)a"),
18601839
hir_cat(vec![
1861-
hir_group_nocap(hir_cat(vec![
1840+
hir_cat(vec![
18621841
hir_lit("a"),
18631842
hir_uclass(&[('A', 'A'), ('a', 'a')]),
1864-
])),
1843+
]),
18651844
hir_lit("a"),
18661845
])
18671846
);
18681847
#[cfg(feature = "unicode-case")]
18691848
assert_eq!(
18701849
t("(?i)(?:a(?-i)a)a"),
18711850
hir_cat(vec![
1872-
hir_group_nocap(hir_cat(vec![
1851+
hir_cat(vec![
18731852
hir_uclass(&[('A', 'A'), ('a', 'a')]),
18741853
hir_lit("a"),
1875-
])),
1854+
]),
18761855
hir_uclass(&[('A', 'A'), ('a', 'a')]),
18771856
])
18781857
);

src/compile.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -368,19 +368,19 @@ impl Compiler {
368368
self.c_empty_look(prog::EmptyLook::NotWordBoundary)
369369
}
370370
},
371-
Group(ref g) => match g.kind {
372-
hir::GroupKind::NonCapturing => self.c(&g.hir),
373-
hir::GroupKind::Capture { index, ref name } => {
374-
if index as usize >= self.compiled.captures.len() {
375-
self.compiled.captures.push(name.clone());
376-
if let Some(ref name) = *name {
377-
self.capture_name_idx
378-
.insert(name.clone(), index as usize);
379-
}
371+
Group(hir::Group { index, ref name, ref hir }) => {
372+
if index as usize >= self.compiled.captures.len() {
373+
let name = match *name {
374+
None => None,
375+
Some(ref boxed_str) => Some(boxed_str.to_string()),
376+
};
377+
self.compiled.captures.push(name.clone());
378+
if let Some(name) = name {
379+
self.capture_name_idx.insert(name, index as usize);
380380
}
381-
self.c_capture(2 * index as usize, &g.hir)
382381
}
383-
},
382+
self.c_capture(2 * index as usize, hir)
383+
}
384384
Concat(ref es) => {
385385
if self.compiled.is_reverse {
386386
self.c_concat(es.iter().rev())

0 commit comments

Comments
 (0)