Skip to content

Commit 8f09510

Browse files
committed
syntax: tweak concat and alternation construction
We simplify construction a bit to prepare for bigger simplifications. We also fix a bug in 'Hir::alternation' where it would incorrectly return 'Hir::empty()' when given an empty alternation. That's correct for an empty concatenation, but an alternation with no branches is equivalent to an expression that never matches anything. To fix that, we create a new 'Hir::fail' that canonicalizes the HIR value used to indicate "impossible to match." Thankfully this bug was unlikely to be observed unless one was constructing HIR values manually. Namely, it is impossible to spell "empty alternation" in the concrete syntax of a regex.
1 parent be57a23 commit 8f09510

File tree

2 files changed

+49
-9
lines changed

2 files changed

+49
-9
lines changed

regex-syntax/src/hir/mod.rs

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,29 @@ impl Hir {
236236
Hir { kind: HirKind::Empty, props }
237237
}
238238

239+
/// Returns an HIR expression that can never match anything. That is, the
240+
/// set of strings in the language described by the HIR returned is empty.
241+
///
242+
/// This is distinct from [`Hir::empty`] in that the empty string matches
243+
/// the HIR returned by `Hir::empty`. That is, the set of strings in the
244+
/// language described by `Hir::empty` is non-empty.
245+
///
246+
/// Note that currently, the HIR returned uses an empty character class to
247+
/// indicate that nothing can match. An equivalent expression that cannot
248+
/// match is an empty alternation, but all such "fail" expressions are
249+
/// normalized (via smart constructors) to empty character classes. This is
250+
/// because empty character classes can be spelled in the concrete syntax
251+
/// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
252+
/// empty alternations cannot.
253+
pub fn fail() -> Hir {
254+
let class = Class::Bytes(ClassBytes::empty());
255+
let props = Properties::class(&class);
256+
// We can't just call Hir::class here because it defers to Hir::fail
257+
// in order to canonicalize the Hir value used to represent "cannot
258+
// match."
259+
Hir { kind: HirKind::Class(class), props }
260+
}
261+
239262
/// Creates a literal HIR expression.
240263
///
241264
/// If the given literal has a `Byte` variant with an ASCII byte, then this
@@ -254,7 +277,9 @@ impl Hir {
254277

255278
/// Creates a class HIR expression.
256279
pub fn class(class: Class) -> Hir {
257-
if let Some(bytes) = class.literal() {
280+
if class.is_empty() {
281+
return Hir::fail();
282+
} else if let Some(bytes) = class.literal() {
258283
return Hir::literal(bytes);
259284
}
260285
let props = Properties::class(&class);
@@ -293,20 +318,24 @@ impl Hir {
293318
///
294319
/// This flattens the concatenation as appropriate.
295320
pub fn concat(mut exprs: Vec<Hir>) -> Hir {
296-
match exprs.len() {
297-
0 => Hir::empty(),
298-
1 => exprs.pop().unwrap(),
299-
_ => {
300-
let props = Properties::concat(&exprs);
301-
Hir { kind: HirKind::Concat(exprs), props }
302-
}
321+
if exprs.is_empty() {
322+
return Hir::empty();
323+
} else if exprs.len() == 1 {
324+
return exprs.pop().unwrap();
303325
}
326+
let props = Properties::concat(&exprs);
327+
Hir { kind: HirKind::Concat(exprs), props }
304328
}
305329

306330
/// Returns the alternation of the given expressions.
307331
///
308332
/// This flattens the alternation as appropriate.
309333
pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
334+
if exprs.is_empty() {
335+
return Hir::fail();
336+
} else if exprs.len() == 1 {
337+
return exprs.pop().unwrap();
338+
}
310339
match exprs.len() {
311340
0 => Hir::empty(),
312341
1 => exprs.pop().unwrap(),
@@ -538,6 +567,17 @@ impl Class {
538567
}
539568
}
540569

570+
/// Returns true if and only if this character class is empty. That is,
571+
/// it has no elements.
572+
///
573+
/// An empty character class can never match anything, including an empty string.
574+
pub fn is_empty(&self) -> bool {
575+
match *self {
576+
Class::Unicode(ref x) => x.ranges().is_empty(),
577+
Class::Bytes(ref x) => x.ranges().is_empty(),
578+
}
579+
}
580+
541581
/// If this class consists of exactly one element (whether a codepoint or a
542582
/// byte), then return it as a literal byte string.
543583
///

regex-syntax/src/hir/translate.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2803,7 +2803,7 @@ mod tests {
28032803
fn class_bracketed_nested() {
28042804
assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')])));
28052805
assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')])));
2806-
assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[])));
2806+
assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));
28072807

28082808
assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
28092809
assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));

0 commit comments

Comments
 (0)