rust-lang
diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs
Lines changed: 6 additions & 15 deletions
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
Lines changed: 13 additions & 36 deletions
diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs
Lines changed: 79 additions & 17 deletions
@@ -582,12 +582,8 @@ impl Literals {
 
 fn prefixes(expr: &Hir, lits: &mut Literals) {
     match *expr.kind() {
-        HirKind::Literal(hir::Literal::Unicode(c)) => {
-            let mut buf = [0; 4];
-            lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
-        }
-        HirKind::Literal(hir::Literal::Byte(b)) => {
-            lits.cross_add(&[b]);
+        HirKind::Literal(hir::Literal(ref bytes)) => {
+            lits.cross_add(bytes);
         }
         HirKind::Class(hir::Class::Unicode(ref cls)) => {
             if !lits.add_char_class(cls) {
@@ -648,15 +644,10 @@ fn prefixes(expr: &Hir, lits: &mut Literals) {
 
 fn suffixes(expr: &Hir, lits: &mut Literals) {
     match *expr.kind() {
-        HirKind::Literal(hir::Literal::Unicode(c)) => {
-            let mut buf = [0u8; 4];
-            let i = c.encode_utf8(&mut buf).len();
-            let buf = &mut buf[..i];
-            buf.reverse();
-            lits.cross_add(buf);
-        }
-        HirKind::Literal(hir::Literal::Byte(b)) => {
-            lits.cross_add(&[b]);
+        HirKind::Literal(hir::Literal(ref bytes)) => {
+            let mut bytes = bytes.to_vec();
+            bytes.reverse();
+            lits.cross_add(&bytes);
         }
         HirKind::Class(hir::Class::Unicode(ref cls)) => {
             if !lits.add_char_class_reverse(cls) {
 
@@ -169,7 +169,7 @@ pub enum HirKind {
     /// The empty regular expression, which matches everything, including the
     /// empty string.
     Empty,
-    /// A single literal character that matches exactly this character.
+    /// A literal string that matches exactly these bytes.
     Literal(Literal),
     /// A single character class that matches any of the characters in the
     /// class. A class can either consist of Unicode scalar values as
@@ -231,13 +231,14 @@ impl Hir {
     /// If the given literal has a `Byte` variant with an ASCII byte, then this
     /// method panics. This enforces the invariant that `Byte` variants are
     /// only used to express matching of invalid UTF-8.
-    pub fn literal(lit: Literal) -> Hir {
-        if let Literal::Byte(b) = lit {
-            assert!(b > 0x7F);
+    pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
+        let bytes = lit.into();
+        if bytes.is_empty() {
+            return Hir::empty();
         }
 
         let mut info = HirInfo::new();
-        info.set_always_utf8(lit.is_unicode());
+        info.set_always_utf8(core::str::from_utf8(&bytes).is_ok());
         info.set_all_assertions(false);
         info.set_anchored_start(false);
         info.set_anchored_end(false);
@@ -248,7 +249,7 @@ impl Hir {
         info.set_match_empty(false);
         info.set_literal(true);
         info.set_alternation_literal(true);
-        Hir { kind: HirKind::Literal(lit), info }
+        Hir { kind: HirKind::Literal(Literal(bytes)), info }
     }
 
     /// Creates a class HIR expression.
@@ -710,24 +711,7 @@ impl core::fmt::Display for Hir {
 /// are preferred whenever possible. In particular, a `Byte` variant is only
 /// ever produced when it could match invalid UTF-8.
 #[derive(Clone, Debug, Eq, PartialEq)]
-pub enum Literal {
-    /// A single character represented by a Unicode scalar value.
-    Unicode(char),
-    /// A single character represented by an arbitrary byte.
-    Byte(u8),
-}
-
-impl Literal {
-    /// Returns true if and only if this literal corresponds to a Unicode
-    /// scalar value.
-    pub fn is_unicode(&self) -> bool {
-        match *self {
-            Literal::Unicode(_) => true,
-            Literal::Byte(b) if b <= 0x7F => true,
-            Literal::Byte(_) => false,
-        }
-    }
-}
+pub struct Literal(pub Box<[u8]>);
 
 /// The high-level intermediate representation of a character class.
 ///
@@ -739,12 +723,11 @@ impl Literal {
 /// A character class, regardless of its character type, is represented by a
 /// sequence of non-overlapping non-adjacent ranges of characters.
 ///
-/// Note that unlike [`Literal`], a `Bytes` variant may be produced even when
-/// it exclusively matches valid UTF-8. This is because a `Bytes` variant
-/// represents an intention by the author of the regular expression to disable
-/// Unicode mode, which in turn impacts the semantics of case insensitive
-/// matching. For example, `(?i)k` and `(?i-u)k` will not match the same set of
-/// strings.
+/// Note that a `Bytes` variant may be produced even when it exclusively
+/// matches valid UTF-8. This is because a `Bytes` variant represents an
+/// intention by the author of the regular expression to disable Unicode mode,
+/// which in turn impacts the semantics of case insensitive matching. For
+/// example, `(?i)k` and `(?i-u)k` will not match the same set of strings.
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub enum Class {
     /// A set of characters represented by Unicode scalar values.
@@ -2222,12 +2205,6 @@ mod tests {
         assert_eq!(expected, bsymdifference(&cls1, &cls2));
     }
 
-    #[test]
-    #[should_panic]
-    fn hir_byte_literal_non_ascii() {
-        Hir::literal(Literal::Byte(b'a'));
-    }
-
     // We use a thread with an explicit stack size to test that our destructor
     // for Hir can handle arbitrarily sized expressions in constant stack
     // space. In case we run on a platform without threads (WASM?), we limit
 
@@ -92,11 +92,37 @@ impl<W: fmt::Write> Visitor for Writer<W> {
             // Empty is represented by nothing in the concrete syntax, and
             // repetition operators are strictly suffix oriented.
             HirKind::Empty | HirKind::Repetition(_) => {}
-            HirKind::Literal(hir::Literal::Unicode(c)) => {
-                self.write_literal_char(c)?;
-            }
-            HirKind::Literal(hir::Literal::Byte(b)) => {
-                self.write_literal_byte(b)?;
+            HirKind::Literal(hir::Literal(ref bytes)) => {
+                // See the comment on the 'Concat' and 'Alternation' case below
+                // for why we put parens here. Literals are, conceptually,
+                // a special case of concatenation where each element is a
+                // character. The HIR flattens this into a Box<[u8]>, but we
+                // still need to treat it like a concatenation for correct
+                // printing. As a special case, we don't write parens if there
+                // is only one character. One character means there is no
+                // concat so we don't need parens. Adding parens would still be
+                // correct, but we drop them here because it tends to create
+                // rather noisy regexes even in simple cases.
+                let result = core::str::from_utf8(bytes);
+                let len = result.map_or(bytes.len(), |s| s.chars().count());
+                if len > 1 {
+                    self.wtr.write_str(r"(?:")?;
+                }
+                match result {
+                    Ok(string) => {
+                        for c in string.chars() {
+                            self.write_literal_char(c)?;
+                        }
+                    }
+                    Err(_) => {
+                        for &b in bytes.iter() {
+                            self.write_literal_byte(b)?;
+                        }
+                    }
+                }
+                if len > 1 {
+                    self.wtr.write_str(r")")?;
+                }
             }
             HirKind::Class(hir::Class::Unicode(ref cls)) => {
                 self.wtr.write_str("[")?;
@@ -429,19 +455,31 @@ mod tests {
     #[test]
     fn regression_repetition_concat() {
         let expr = Hir::concat(alloc::vec![
-            Hir::literal(hir::Literal::Unicode('x')),
+            Hir::literal("x".as_bytes()),
+            Hir::repetition(hir::Repetition {
+                min: 1,
+                max: None,
+                greedy: true,
+                hir: Box::new(Hir::literal("ab".as_bytes())),
+            }),
+            Hir::literal("y".as_bytes()),
+        ]);
+        assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());
+
+        let expr = Hir::concat(alloc::vec![
+            Hir::look(hir::Look::Start),
             Hir::repetition(hir::Repetition {
                 min: 1,
                 max: None,
                 greedy: true,
                 hir: Box::new(Hir::concat(alloc::vec![
-                    Hir::literal(hir::Literal::Unicode('a')),
-                    Hir::literal(hir::Literal::Unicode('b')),
+                    Hir::look(hir::Look::Start),
+                    Hir::look(hir::Look::End),
                 ])),
             }),
-            Hir::literal(hir::Literal::Unicode('y')),
+            Hir::look(hir::Look::End),
         ]);
-        assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());
+        assert_eq!(r"(?:\A(?:\A\z)+\z)", expr.to_string());
     }
 
     // Just like regression_repetition_concat, but with the repetition using
@@ -451,19 +489,34 @@ mod tests {
     #[test]
     fn regression_repetition_alternation() {
         let expr = Hir::concat(alloc::vec![
-            Hir::literal(hir::Literal::Unicode('x')),
+            Hir::literal("x".as_bytes()),
             Hir::repetition(hir::Repetition {
                 min: 1,
                 max: None,
                 greedy: true,
                 hir: Box::new(Hir::alternation(alloc::vec![
-                    Hir::literal(hir::Literal::Unicode('a')),
-                    Hir::literal(hir::Literal::Unicode('b')),
+                    Hir::literal("a".as_bytes()),
+                    Hir::literal("b".as_bytes()),
                 ])),
             }),
-            Hir::literal(hir::Literal::Unicode('y')),
+            Hir::literal("y".as_bytes()),
         ]);
         assert_eq!(r"(?:x(?:a|b)+y)", expr.to_string());
+
+        let expr = Hir::concat(alloc::vec![
+            Hir::look(hir::Look::Start),
+            Hir::repetition(hir::Repetition {
+                min: 1,
+                max: None,
+                greedy: true,
+                hir: Box::new(Hir::alternation(alloc::vec![
+                    Hir::look(hir::Look::Start),
+                    Hir::look(hir::Look::End),
+                ])),
+            }),
+            Hir::look(hir::Look::End),
+        ]);
+        assert_eq!(r"(?:\A(?:\A|\z)+\z)", expr.to_string());
     }
 
     // This regression test is very similar in flavor to
@@ -480,12 +533,21 @@ mod tests {
     #[test]
     fn regression_alternation_concat() {
         let expr = Hir::concat(alloc::vec![
-            Hir::literal(hir::Literal::Unicode('a')),
+            Hir::literal("a".as_bytes()),
             Hir::alternation(alloc::vec![
-                Hir::literal(hir::Literal::Unicode('b')),
-                Hir::literal(hir::Literal::Unicode('c')),
+                Hir::literal("b".as_bytes()),
+                Hir::literal("c".as_bytes()),
             ]),
         ]);
         assert_eq!(r"(?:a(?:b|c))", expr.to_string());
+
+        let expr = Hir::concat(alloc::vec![
+            Hir::look(hir::Look::Start),
+            Hir::alternation(alloc::vec![
+                Hir::look(hir::Look::Start),
+                Hir::look(hir::Look::End),
+            ]),
+        ]);
+        assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
     }
 }