Skip to content

Commit 0e95c01

Browse files
committed
syntax: switch to 'Vec<u8>' to represent literals
This gets rid of the old 'Literal' type: enum Literal { Unicode(char), Byte(u8), } and replaces it with struct Literal(Box<[u8]>); I did this primarily because I perceive the new version to be a bit simpler and it is very likely to be more space efficient given some of the changes I have in mind (upcoming in subsequent commits). Namely, I want to include more analysis information beyond just simply booleans, and this means using up more space. Putting that analysis information on every single byte/char seems gratuitous. But putting it on every single sequence of byte/chars seems more justifiable. I also have a hand-wavy idea that this might make analysis a bit easier. And another hand-wavy idea that debug-printing such an HIR will make it a bit more comprehensible. Overall, this isn't a completely obvious win and I do wonder whether I'll regret this. For one thing, the translator is now a fair bit more complicated in exchange for not creating a 'Vec<u8>' for every 'ast::Literal' node. This also gives up the Unicode vs byte distinction and just commits to "all bytes." Instead, we do a UTF-8 check on every 'Hir::literal' call, and that in turn sets the UTF-8 property. This does seem a bit wasteful, and indeed, we do another UTF-8 check in the compiler (even though we could use 'unsafe' correctly and avoid it). However, once the new NFA compiler lands from regex-automata, it operates purely in byte-land and will not need to do another UTF-8 check. Moreover, a UTF-8 check, even on every literal, is likely barely measurable in the grand scheme of things. I do also worry that this is overwrought. In particular, the AST creates a node for each character. Then the HIR smooths them out to sequences of characters (that is, Vec<u8>). And then NFA compilation splits them back out into states where a state handles at most one character (or range of characters). But, I am taking somewhat of a leap-of-judgment here that this will make analysis easier and will overall use less space. 
But we'll see.
1 parent 9d9b0e8 commit 0e95c01

File tree

7 files changed

+380
-187
lines changed

7 files changed

+380
-187
lines changed

regex-syntax/src/hir/literal/mod.rs

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -582,12 +582,8 @@ impl Literals {
582582

583583
fn prefixes(expr: &Hir, lits: &mut Literals) {
584584
match *expr.kind() {
585-
HirKind::Literal(hir::Literal::Unicode(c)) => {
586-
let mut buf = [0; 4];
587-
lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
588-
}
589-
HirKind::Literal(hir::Literal::Byte(b)) => {
590-
lits.cross_add(&[b]);
585+
HirKind::Literal(hir::Literal(ref bytes)) => {
586+
lits.cross_add(bytes);
591587
}
592588
HirKind::Class(hir::Class::Unicode(ref cls)) => {
593589
if !lits.add_char_class(cls) {
@@ -648,15 +644,10 @@ fn prefixes(expr: &Hir, lits: &mut Literals) {
648644

649645
fn suffixes(expr: &Hir, lits: &mut Literals) {
650646
match *expr.kind() {
651-
HirKind::Literal(hir::Literal::Unicode(c)) => {
652-
let mut buf = [0u8; 4];
653-
let i = c.encode_utf8(&mut buf).len();
654-
let buf = &mut buf[..i];
655-
buf.reverse();
656-
lits.cross_add(buf);
657-
}
658-
HirKind::Literal(hir::Literal::Byte(b)) => {
659-
lits.cross_add(&[b]);
647+
HirKind::Literal(hir::Literal(ref bytes)) => {
648+
let mut bytes = bytes.to_vec();
649+
bytes.reverse();
650+
lits.cross_add(&bytes);
660651
}
661652
HirKind::Class(hir::Class::Unicode(ref cls)) => {
662653
if !lits.add_char_class_reverse(cls) {

regex-syntax/src/hir/mod.rs

Lines changed: 13 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ pub enum HirKind {
169169
/// The empty regular expression, which matches everything, including the
170170
/// empty string.
171171
Empty,
172-
/// A single literal character that matches exactly this character.
172+
/// A literal string that matches exactly these bytes.
173173
Literal(Literal),
174174
/// A single character class that matches any of the characters in the
175175
/// class. A class can either consist of Unicode scalar values as
@@ -231,13 +231,14 @@ impl Hir {
231231
/// If the given literal has a `Byte` variant with an ASCII byte, then this
232232
/// method panics. This enforces the invariant that `Byte` variants are
233233
/// only used to express matching of invalid UTF-8.
234-
pub fn literal(lit: Literal) -> Hir {
235-
if let Literal::Byte(b) = lit {
236-
assert!(b > 0x7F);
234+
pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
235+
let bytes = lit.into();
236+
if bytes.is_empty() {
237+
return Hir::empty();
237238
}
238239

239240
let mut info = HirInfo::new();
240-
info.set_always_utf8(lit.is_unicode());
241+
info.set_always_utf8(core::str::from_utf8(&bytes).is_ok());
241242
info.set_all_assertions(false);
242243
info.set_anchored_start(false);
243244
info.set_anchored_end(false);
@@ -248,7 +249,7 @@ impl Hir {
248249
info.set_match_empty(false);
249250
info.set_literal(true);
250251
info.set_alternation_literal(true);
251-
Hir { kind: HirKind::Literal(lit), info }
252+
Hir { kind: HirKind::Literal(Literal(bytes)), info }
252253
}
253254

254255
/// Creates a class HIR expression.
@@ -710,24 +711,7 @@ impl core::fmt::Display for Hir {
710711
/// are preferred whenever possible. In particular, a `Byte` variant is only
711712
/// ever produced when it could match invalid UTF-8.
712713
#[derive(Clone, Debug, Eq, PartialEq)]
713-
pub enum Literal {
714-
/// A single character represented by a Unicode scalar value.
715-
Unicode(char),
716-
/// A single character represented by an arbitrary byte.
717-
Byte(u8),
718-
}
719-
720-
impl Literal {
721-
/// Returns true if and only if this literal corresponds to a Unicode
722-
/// scalar value.
723-
pub fn is_unicode(&self) -> bool {
724-
match *self {
725-
Literal::Unicode(_) => true,
726-
Literal::Byte(b) if b <= 0x7F => true,
727-
Literal::Byte(_) => false,
728-
}
729-
}
730-
}
714+
pub struct Literal(pub Box<[u8]>);
731715

732716
/// The high-level intermediate representation of a character class.
733717
///
@@ -739,12 +723,11 @@ impl Literal {
739723
/// A character class, regardless of its character type, is represented by a
740724
/// sequence of non-overlapping non-adjacent ranges of characters.
741725
///
742-
/// Note that unlike [`Literal`], a `Bytes` variant may be produced even when
743-
/// it exclusively matches valid UTF-8. This is because a `Bytes` variant
744-
/// represents an intention by the author of the regular expression to disable
745-
/// Unicode mode, which in turn impacts the semantics of case insensitive
746-
/// matching. For example, `(?i)k` and `(?i-u)k` will not match the same set of
747-
/// strings.
726+
/// Note that a `Bytes` variant may be produced even when it exclusively matches
727+
/// valid UTF-8. This is because a `Bytes` variant represents an intention by
728+
/// the author of the regular expression to disable Unicode mode, which in turn
729+
/// impacts the semantics of case insensitive matching. For example, `(?i)k`
730+
/// and `(?i-u)k` will not match the same set of strings.
748731
#[derive(Clone, Debug, Eq, PartialEq)]
749732
pub enum Class {
750733
/// A set of characters represented by Unicode scalar values.
@@ -2222,12 +2205,6 @@ mod tests {
22222205
assert_eq!(expected, bsymdifference(&cls1, &cls2));
22232206
}
22242207

2225-
#[test]
2226-
#[should_panic]
2227-
fn hir_byte_literal_non_ascii() {
2228-
Hir::literal(Literal::Byte(b'a'));
2229-
}
2230-
22312208
// We use a thread with an explicit stack size to test that our destructor
22322209
// for Hir can handle arbitrarily sized expressions in constant stack
22332210
// space. In case we run on a platform without threads (WASM?), we limit

regex-syntax/src/hir/print.rs

Lines changed: 79 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,37 @@ impl<W: fmt::Write> Visitor for Writer<W> {
9292
// Empty is represented by nothing in the concrete syntax, and
9393
// repetition operators are strictly suffix oriented.
9494
HirKind::Empty | HirKind::Repetition(_) => {}
95-
HirKind::Literal(hir::Literal::Unicode(c)) => {
96-
self.write_literal_char(c)?;
97-
}
98-
HirKind::Literal(hir::Literal::Byte(b)) => {
99-
self.write_literal_byte(b)?;
95+
HirKind::Literal(hir::Literal(ref bytes)) => {
96+
// See the comment on the 'Concat' and 'Alternation' case below
97+
// for why we put parens here. Literals are, conceptually,
98+
// a special case of concatenation where each element is a
99+
// character. The HIR flattens this into a Box<[u8]>, but we
100+
// still need to treat it like a concatenation for correct
101+
// printing. As a special case, we don't write parens if there
102+
// is only one character. One character means there is no
103+
// concat so we don't need parens. Adding parens would still be
104+
// correct, but we drop them here because it tends to create
105+
// rather noisy regexes even in simple cases.
106+
let result = core::str::from_utf8(bytes);
107+
let len = result.map_or(bytes.len(), |s| s.chars().count());
108+
if len > 1 {
109+
self.wtr.write_str(r"(?:")?;
110+
}
111+
match result {
112+
Ok(string) => {
113+
for c in string.chars() {
114+
self.write_literal_char(c)?;
115+
}
116+
}
117+
Err(_) => {
118+
for &b in bytes.iter() {
119+
self.write_literal_byte(b)?;
120+
}
121+
}
122+
}
123+
if len > 1 {
124+
self.wtr.write_str(r")")?;
125+
}
100126
}
101127
HirKind::Class(hir::Class::Unicode(ref cls)) => {
102128
self.wtr.write_str("[")?;
@@ -429,19 +455,31 @@ mod tests {
429455
#[test]
430456
fn regression_repetition_concat() {
431457
let expr = Hir::concat(alloc::vec![
432-
Hir::literal(hir::Literal::Unicode('x')),
458+
Hir::literal("x".as_bytes()),
459+
Hir::repetition(hir::Repetition {
460+
min: 1,
461+
max: None,
462+
greedy: true,
463+
hir: Box::new(Hir::literal("ab".as_bytes())),
464+
}),
465+
Hir::literal("y".as_bytes()),
466+
]);
467+
assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());
468+
469+
let expr = Hir::concat(alloc::vec![
470+
Hir::look(hir::Look::Start),
433471
Hir::repetition(hir::Repetition {
434472
min: 1,
435473
max: None,
436474
greedy: true,
437475
hir: Box::new(Hir::concat(alloc::vec![
438-
Hir::literal(hir::Literal::Unicode('a')),
439-
Hir::literal(hir::Literal::Unicode('b')),
476+
Hir::look(hir::Look::Start),
477+
Hir::look(hir::Look::End),
440478
])),
441479
}),
442-
Hir::literal(hir::Literal::Unicode('y')),
480+
Hir::look(hir::Look::End),
443481
]);
444-
assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());
482+
assert_eq!(r"(?:\A(?:\A\z)+\z)", expr.to_string());
445483
}
446484

447485
// Just like regression_repetition_concat, but with the repetition using
@@ -451,19 +489,34 @@ mod tests {
451489
#[test]
452490
fn regression_repetition_alternation() {
453491
let expr = Hir::concat(alloc::vec![
454-
Hir::literal(hir::Literal::Unicode('x')),
492+
Hir::literal("x".as_bytes()),
455493
Hir::repetition(hir::Repetition {
456494
min: 1,
457495
max: None,
458496
greedy: true,
459497
hir: Box::new(Hir::alternation(alloc::vec![
460-
Hir::literal(hir::Literal::Unicode('a')),
461-
Hir::literal(hir::Literal::Unicode('b')),
498+
Hir::literal("a".as_bytes()),
499+
Hir::literal("b".as_bytes()),
462500
])),
463501
}),
464-
Hir::literal(hir::Literal::Unicode('y')),
502+
Hir::literal("y".as_bytes()),
465503
]);
466504
assert_eq!(r"(?:x(?:a|b)+y)", expr.to_string());
505+
506+
let expr = Hir::concat(alloc::vec![
507+
Hir::look(hir::Look::Start),
508+
Hir::repetition(hir::Repetition {
509+
min: 1,
510+
max: None,
511+
greedy: true,
512+
hir: Box::new(Hir::alternation(alloc::vec![
513+
Hir::look(hir::Look::Start),
514+
Hir::look(hir::Look::End),
515+
])),
516+
}),
517+
Hir::look(hir::Look::End),
518+
]);
519+
assert_eq!(r"(?:\A(?:\A|\z)+\z)", expr.to_string());
467520
}
468521

469522
// This regression test is very similar in flavor to
@@ -480,12 +533,21 @@ mod tests {
480533
#[test]
481534
fn regression_alternation_concat() {
482535
let expr = Hir::concat(alloc::vec![
483-
Hir::literal(hir::Literal::Unicode('a')),
536+
Hir::literal("a".as_bytes()),
484537
Hir::alternation(alloc::vec![
485-
Hir::literal(hir::Literal::Unicode('b')),
486-
Hir::literal(hir::Literal::Unicode('c')),
538+
Hir::literal("b".as_bytes()),
539+
Hir::literal("c".as_bytes()),
487540
]),
488541
]);
489542
assert_eq!(r"(?:a(?:b|c))", expr.to_string());
543+
544+
let expr = Hir::concat(alloc::vec![
545+
Hir::look(hir::Look::Start),
546+
Hir::alternation(alloc::vec![
547+
Hir::look(hir::Look::Start),
548+
Hir::look(hir::Look::End),
549+
]),
550+
]);
551+
assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
490552
}
491553
}

0 commit comments

Comments
 (0)