Skip to content

Commit 224dc3a

Browse files
committed
syntax: small HIR simplifications
This makes it so 'a{1}' is rewritten as 'a' and '[a]' is rewritten as 'a'. A lot of the tests expected '[a]' to get preserved as a class in the HIR, so this required a bit of surgery.
1 parent 5d08243 commit 224dc3a

File tree

3 files changed

+124
-36
lines changed

3 files changed

+124
-36
lines changed

regex-syntax/src/hir/mod.rs

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -122,16 +122,16 @@ impl core::fmt::Display for ErrorKind {
122122

123123
/// A high-level intermediate representation (HIR) for a regular expression.
124124
///
125-
/// The HIR of a regular expression represents an intermediate step between its
126-
/// abstract syntax (a structured description of the concrete syntax) and
127-
/// compiled byte codes. The purpose of HIR is to make regular expressions
125+
/// The HIR of a regular expression represents an intermediate step between
126+
/// its abstract syntax (a structured description of the concrete syntax) and
127+
/// an actual regex matcher. The purpose of HIR is to make regular expressions
128128
/// easier to analyze. In particular, the AST is much more complex than the
129129
/// HIR. For example, while an AST supports arbitrarily nested character
130130
/// classes, the HIR will flatten all nested classes into a single set. The HIR
131131
/// will also "compile away" every flag present in the concrete syntax. For
132132
/// example, users of HIR expressions never need to worry about case folding;
133-
/// it is handled automatically by the translator (e.g., by translating `(?i)A`
134-
/// to `[aA]`).
133+
/// it is handled automatically by the translator (e.g., by translating
134+
/// `(?i:A)` to `[aA]`).
135135
///
136136
/// If the HIR was produced by a translator that disallows invalid UTF-8, then
137137
/// the HIR is guaranteed to match UTF-8 exclusively.
@@ -150,11 +150,13 @@ impl core::fmt::Display for ErrorKind {
150150
/// 2. Every HIR expression contains attributes that are defined inductively,
151151
/// and can be computed cheaply during the construction process. For
152152
/// example, one such attribute is whether the expression must match at the
153-
/// beginning of the text.
153+
/// beginning of the haystack.
154154
///
155155
/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
156156
/// expression pattern string, and uses constant stack space and heap space
157-
/// proportional to the size of the `Hir`.
157+
/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
158+
/// be _semantically_ equivalent to the original concrete syntax, but it may
159+
/// look very different. (And potentially not practically readable by a human.)
158160
#[derive(Clone, Debug, Eq, PartialEq)]
159161
pub struct Hir {
160162
/// The underlying HIR kind.
@@ -252,6 +254,9 @@ impl Hir {
252254

253255
/// Creates a class HIR expression.
254256
pub fn class(class: Class) -> Hir {
257+
if let Some(bytes) = class.literal() {
258+
return Hir::literal(bytes);
259+
}
255260
let props = Properties::class(&class);
256261
Hir { kind: HirKind::Class(class), props }
257262
}
@@ -267,8 +272,12 @@ impl Hir {
267272
// The regex 'a{0}' is always equivalent to the empty regex. This is
268273
// true even when 'a' is an expression that never matches anything
269274
// (like '\P{any}').
275+
//
276+
// Additionally, the regex 'a{1}' is always equivalent to 'a'.
270277
if rep.min == 0 && rep.max == Some(0) {
271278
return Hir::empty();
279+
} else if rep.min == 1 && rep.max == Some(1) {
280+
return *rep.hir;
272281
}
273282
let props = Properties::repetition(&rep);
274283
Hir { kind: HirKind::Repetition(rep), props }
@@ -541,6 +550,18 @@ impl Class {
541550
Class::Bytes(ref x) => x.maximum_len(),
542551
}
543552
}
553+
554+
/// If this class consists of exactly one element (whether a codepoint or a
555+
/// byte), then return it as a literal byte string.
556+
///
557+
/// If this class is empty or contains more than one element, then `None`
558+
/// is returned.
559+
pub fn literal(&self) -> Option<Vec<u8>> {
560+
match *self {
561+
Class::Unicode(ref x) => x.literal(),
562+
Class::Bytes(ref x) => x.literal(),
563+
}
564+
}
544565
}
545566

546567
/// A set of characters represented by Unicode scalar values.
@@ -680,6 +701,20 @@ impl ClassUnicode {
680701
// Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
681702
Some(last.end.len_utf8())
682703
}
704+
705+
/// If this class consists of exactly one codepoint, then return it as
706+
/// a literal byte string.
707+
///
708+
/// If this class is empty or contains more than one codepoint, then `None`
709+
/// is returned.
710+
pub fn literal(&self) -> Option<Vec<u8>> {
711+
let rs = self.ranges();
712+
if rs.len() == 1 && rs[0].start == rs[0].end {
713+
Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
714+
} else {
715+
None
716+
}
717+
}
683718
}
684719

685720
/// An iterator over all ranges in a Unicode character class.
@@ -932,6 +967,20 @@ impl ClassBytes {
932967
Some(1)
933968
}
934969
}
970+
971+
/// If this class consists of exactly one byte, then return it as
972+
/// a literal byte string.
973+
///
974+
/// If this class is empty or contains more than one byte, then `None`
975+
/// is returned.
976+
pub fn literal(&self) -> Option<Vec<u8>> {
977+
let rs = self.ranges();
978+
if rs.len() == 1 && rs[0].start == rs[0].end {
979+
Some(vec![rs[0].start])
980+
} else {
981+
None
982+
}
983+
}
935984
}
936985

937986
/// An iterator over all ranges in a byte character class.

regex-syntax/src/hir/print.rs

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,11 @@ impl<W: fmt::Write> Visitor for Writer<W> {
129129
for range in cls.iter() {
130130
if range.start() == range.end() {
131131
self.write_literal_char(range.start())?;
132+
} else if u32::from(range.start()) + 1
133+
== u32::from(range.end())
134+
{
135+
self.write_literal_char(range.start())?;
136+
self.write_literal_char(range.end())?;
132137
} else {
133138
self.write_literal_char(range.start())?;
134139
self.wtr.write_str("-")?;
@@ -142,6 +147,9 @@ impl<W: fmt::Write> Visitor for Writer<W> {
142147
for range in cls.iter() {
143148
if range.start() == range.end() {
144149
self.write_literal_class_byte(range.start())?;
150+
} else if range.start() + 1 == range.end() {
151+
self.write_literal_class_byte(range.start())?;
152+
self.write_literal_class_byte(range.end())?;
145153
} else {
146154
self.write_literal_class_byte(range.start())?;
147155
self.wtr.write_str("-")?;
@@ -327,26 +335,28 @@ mod tests {
327335

328336
#[test]
329337
fn print_class() {
330-
roundtrip(r"[a]", r"[a]");
338+
roundtrip(r"[a]", r"a");
339+
roundtrip(r"[ab]", r"[ab]");
331340
roundtrip(r"[a-z]", r"[a-z]");
332341
roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
333-
roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]");
334-
roundtrip(r"[-]", r"[\-]");
342+
roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}");
343+
roundtrip(r"[-]", r"\-");
335344
roundtrip(r"[☃-⛄]", r"[☃-⛄]");
336345

337-
roundtrip(r"(?-u)[a]", r"(?-u:[a])");
346+
roundtrip(r"(?-u)[a]", r"a");
347+
roundtrip(r"(?-u)[ab]", r"(?-u:[ab])");
338348
roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
339349
roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");
340350

341351
// The following test that the printer escapes meta characters
342352
// in character classes.
343-
roundtrip(r"[\[]", r"[\[]");
353+
roundtrip(r"[\[]", r"\[");
344354
roundtrip(r"[Z-_]", r"[Z-_]");
345355
roundtrip(r"[Z-_--Z]", r"[\[-_]");
346356

347357
// The following test that the printer escapes meta characters
348358
// in byte oriented character classes.
349-
roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])");
359+
roundtrip_bytes(r"(?-u)[\[]", r"\[");
350360
roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
351361
roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");
352362
}

regex-syntax/src/hir/translate.rs

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1428,19 +1428,11 @@ mod tests {
14281428
}
14291429

14301430
fn hir_uclass(ranges: &[(char, char)]) -> Hir {
1431-
let ranges: Vec<hir::ClassUnicodeRange> = ranges
1432-
.iter()
1433-
.map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
1434-
.collect();
1435-
Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges)))
1431+
Hir::class(uclass(ranges))
14361432
}
14371433

14381434
fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
1439-
let ranges: Vec<hir::ClassBytesRange> = ranges
1440-
.iter()
1441-
.map(|&(s, e)| hir::ClassBytesRange::new(s, e))
1442-
.collect();
1443-
Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
1435+
Hir::class(bclass(ranges))
14441436
}
14451437

14461438
fn hir_case_fold(expr: Hir) -> Hir {
@@ -1463,6 +1455,33 @@ mod tests {
14631455
}
14641456
}
14651457

1458+
// Builds a Unicode `hir::Class` from `(start, end)` pairs without wrapping
// it in an `Hir`.
fn uclass(ranges: &[(char, char)]) -> hir::Class {
    let mut converted: Vec<hir::ClassUnicodeRange> =
        Vec::with_capacity(ranges.len());
    for &(start, end) in ranges {
        converted.push(hir::ClassUnicodeRange::new(start, end));
    }
    hir::Class::Unicode(hir::ClassUnicode::new(converted))
}
1465+
1466+
// Builds a byte-oriented `hir::Class` from `(start, end)` pairs without
// wrapping it in an `Hir`.
fn bclass(ranges: &[(u8, u8)]) -> hir::Class {
    let mut converted: Vec<hir::ClassBytesRange> =
        Vec::with_capacity(ranges.len());
    for &(start, end) in ranges {
        converted.push(hir::ClassBytesRange::new(start, end));
    }
    hir::Class::Bytes(hir::ClassBytes::new(converted))
}
1473+
1474+
// Applies simple case folding to the class and only then wraps it in an
// `Hir` via `Hir::class`.
#[cfg(feature = "unicode-case")]
fn class_case_fold(cls: hir::Class) -> Hir {
    let mut folded = cls;
    folded.case_fold_simple();
    Hir::class(folded)
}
1479+
1480+
fn class_negate(mut cls: hir::Class) -> Hir {
1481+
cls.negate();
1482+
Hir::class(cls)
1483+
}
1484+
14661485
#[allow(dead_code)]
14671486
fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
14681487
use crate::hir::Class::{Bytes, Unicode};
@@ -2522,8 +2541,9 @@ mod tests {
25222541

25232542
#[test]
25242543
fn class_bracketed() {
2525-
assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')]));
2526-
assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')])));
2544+
assert_eq!(t("[a]"), hir_lit("a"));
2545+
assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
2546+
assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
25272547
assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
25282548
assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
25292549
assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
@@ -2586,11 +2606,11 @@ mod tests {
25862606
);
25872607
assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));
25882608

2589-
assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')])));
2590-
assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')])));
2609+
assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
2610+
assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
25912611
assert_eq!(
25922612
t_bytes("(?-u)[^a]"),
2593-
hir_negate(hir_bclass(&[(b'a', b'a')]))
2613+
class_negate(bclass(&[(b'a', b'a')]))
25942614
);
25952615
#[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
25962616
assert_eq!(
@@ -2778,8 +2798,8 @@ mod tests {
27782798

27792799
#[test]
27802800
fn class_bracketed_nested() {
2781-
assert_eq!(t(r"[a[^c]]"), hir_negate(hir_uclass(&[('c', 'c')])));
2782-
assert_eq!(t(r"[a-b[^c]]"), hir_negate(hir_uclass(&[('c', 'c')])));
2801+
assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')])));
2802+
assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')])));
27832803
assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[])));
27842804

27852805
assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
@@ -2788,12 +2808,12 @@ mod tests {
27882808
#[cfg(feature = "unicode-case")]
27892809
assert_eq!(
27902810
t(r"(?i)[a[^c]]"),
2791-
hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))
2811+
hir_negate(class_case_fold(uclass(&[('c', 'c')])))
27922812
);
27932813
#[cfg(feature = "unicode-case")]
27942814
assert_eq!(
27952815
t(r"(?i)[a-b[^c]]"),
2796-
hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))
2816+
hir_negate(class_case_fold(uclass(&[('c', 'c')])))
27972817
);
27982818

27992819
#[cfg(feature = "unicode-case")]
@@ -3239,6 +3259,10 @@ mod tests {
32393259
assert!(props(r"ab").is_literal());
32403260
assert!(props(r"abc").is_literal());
32413261
assert!(props(r"(?m)abc").is_literal());
3262+
assert!(props(r"(?:a)").is_literal());
3263+
assert!(props(r"foo(?:a)").is_literal());
3264+
assert!(props(r"(?:a)foo").is_literal());
3265+
assert!(props(r"[a]").is_literal());
32423266

32433267
// Negative examples.
32443268
assert!(!props(r"").is_literal());
@@ -3248,7 +3272,7 @@ mod tests {
32483272
assert!(!props(r"a+").is_literal());
32493273
assert!(!props(r"foo(a)").is_literal());
32503274
assert!(!props(r"(a)foo").is_literal());
3251-
assert!(!props(r"[a]").is_literal());
3275+
assert!(!props(r"[ab]").is_literal());
32523276
}
32533277

32543278
#[test]
@@ -3262,6 +3286,11 @@ mod tests {
32623286
assert!(props(r"a|b|c").is_alternation_literal());
32633287
assert!(props(r"foo|bar").is_alternation_literal());
32643288
assert!(props(r"foo|bar|baz").is_alternation_literal());
3289+
assert!(props(r"[a]").is_alternation_literal());
3290+
assert!(props(r"[a]|b").is_alternation_literal());
3291+
assert!(props(r"a|[b]").is_alternation_literal());
3292+
assert!(props(r"(?:a)|b").is_alternation_literal());
3293+
assert!(props(r"a|(?:b)").is_alternation_literal());
32653294

32663295
// Negative examples.
32673296
assert!(!props(r"").is_alternation_literal());
@@ -3270,9 +3299,9 @@ mod tests {
32703299
assert!(!props(r"a+").is_alternation_literal());
32713300
assert!(!props(r"foo(a)").is_alternation_literal());
32723301
assert!(!props(r"(a)foo").is_alternation_literal());
3273-
assert!(!props(r"[a]").is_alternation_literal());
3274-
assert!(!props(r"[a]|b").is_alternation_literal());
3275-
assert!(!props(r"a|[b]").is_alternation_literal());
3302+
assert!(!props(r"[ab]").is_alternation_literal());
3303+
assert!(!props(r"[ab]|b").is_alternation_literal());
3304+
assert!(!props(r"a|[ab]").is_alternation_literal());
32763305
assert!(!props(r"(a)|b").is_alternation_literal());
32773306
assert!(!props(r"a|(b)").is_alternation_literal());
32783307
}

0 commit comments

Comments
 (0)