syntax: improve Debug impls

BurntSushi · BurntSushi · commit 1fdb809d6cb6 · 2022-10-09T19:07:46.000-04:00
This makes the Debug impls for Literal and ClassBytesRange a bit better.
The former in particular. Instead of just printing a sequence of decimal
numbers, we now print them as characters.

Given the lackluster support for Vec<u8> as a string in the standard
library, we copy a little bit of code from regex-automata to make the
debug print for the Vec<u8> basically as nice as a String.
diff --git a/regex-syntax/src/debug.rs b/regex-syntax/src/debug.rs
@@ -0,0 +1,108 @@
+/// Newtype around one byte whose `fmt::Debug` output is the byte in escaped,
+/// human-readable form.
+///
+/// A space is shown as `' '` and hex escapes use upper-case digits (`\xAB`).
+pub(crate) struct Byte(pub(crate) u8);
+
+impl core::fmt::Debug for Byte {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        // A bare space is nearly invisible in debug output, so it is shown
+        // quoted instead of going through the generic escaping below.
+        if self.0 == b' ' {
+            return write!(f, "' '");
+        }
+        // Every byte yielded by `ascii::escape_default` is ASCII (at most
+        // four of them, for a `\xNN` escape), so each one converts
+        // losslessly to a `char` and can be written out directly.
+        for (pos, unit) in core::ascii::escape_default(self.0).enumerate() {
+            // Positions 2 and 3 only exist for `\xNN` escapes: upper-case
+            // their hex digits so that `\xab` is rendered as `\xAB`.
+            let is_lower_hex = pos >= 2 && (b'a'..=b'f').contains(&unit);
+            let shown = if is_lower_hex { unit - 32 } else { unit };
+            write!(f, "{}", char::from(shown))?;
+        }
+        Ok(())
+    }
+}
+
+/// A type that provides a human readable debug impl for arbitrary bytes.
+///
+/// This generally works best when the bytes are presumed to be mostly UTF-8,
+/// but will work for anything.
+///
+/// N.B. This is copied nearly verbatim from regex-automata. Sigh.
+pub(crate) struct Bytes<'a>(pub(crate) &'a [u8]);
+
+impl<'a> core::fmt::Debug for Bytes<'a> {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "\"")?;
+        // This is a sad re-implementation of a similar impl found in bstr.
+        let mut bytes = self.0;
+        while let Some(result) = utf8_decode(bytes) {
+            let ch = match result {
+                Ok(ch) => ch,
+                Err(byte) => {
+                    // Undecodable byte: hex-escape it and resume after it.
+                    write!(f, r"\x{:02x}", byte)?;
+                    bytes = &bytes[1..];
+                    continue;
+                }
+            };
+            bytes = &bytes[ch.len_utf8()..];
+            match ch {
+                '\0' => write!(f, "\\0")?,
+                // All remaining ASCII control characters (U+0001..=U+001F
+                // and U+007F) except \n, \r and \t get a `\xNN` escape.
+                '\x01'..='\x08'
+                | '\x0b'
+                | '\x0c'
+                | '\x0e'..='\x1f'
+                | '\x7f' => {
+                    write!(f, "\\x{:02x}", u32::from(ch))?;
+                }
+                // \n, \r, \t and everything else: use `char::escape_debug`.
+                _ => write!(f, "{}", ch.escape_debug())?,
+            }
+        }
+        write!(f, "\"")
+    }
+}
+
+/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
+///
+/// If no valid encoding of a codepoint exists at the beginning of the given
+/// byte slice, then the first byte is returned instead.
+///
+/// This returns `None` if and only if `bytes` is empty.
+fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
+    /// Given a UTF-8 leading byte, this returns the total number of code
+    /// units in the encoded codepoint, or `None` if the byte cannot start one.
+    fn len(byte: u8) -> Option<usize> {
+        if byte <= 0x7F {
+            Some(1)
+        } else if byte & 0b1100_0000 == 0b1000_0000 {
+            None
+        } else if byte <= 0b1101_1111 {
+            Some(2)
+        } else if byte <= 0b1110_1111 {
+            Some(3)
+        } else if byte <= 0b1111_0111 {
+            Some(4)
+        } else {
+            None
+        }
+    }
+
+    let first = *bytes.first()?;
+    // Validate only the bytes of the first codepoint. Validating a fixed
+    // 4-byte window would reject a valid leading codepoint whenever any of
+    // the bytes after it are invalid (e.g. `a\xFF` would report `Err(b'a')`).
+    let n = match len(first) {
+        Some(n) if n <= bytes.len() => n,
+        _ => return Some(Err(first)),
+    };
+    match core::str::from_utf8(&bytes[..n]) {
+        Ok(s) => s.chars().next().map(Ok),
+        Err(_) => Some(Err(first)),
+    }
+}
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
@@ -699,8 +699,7 @@ impl HirKind {
 /// to the size of the `Hir`.
 impl core::fmt::Display for Hir {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        use crate::hir::print::Printer;
-        Printer::new().print(self, f)
+        crate::hir::print::Printer::new().print(self, f)
     }
 }
 
@@ -710,9 +709,15 @@ impl core::fmt::Display for Hir {
 /// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
 /// are preferred whenever possible. In particular, a `Byte` variant is only
 /// ever produced when it could match invalid UTF-8.
-#[derive(Clone, Debug, Eq, PartialEq)]
+#[derive(Clone, Eq, PartialEq)]
 pub struct Literal(pub Box<[u8]>);
 
+impl core::fmt::Debug for Literal {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        // Delegate to the crate's byte-string wrapper so the literal is shown
+        // as an escaped string rather than as a list of decimal numbers.
+        let shown = crate::debug::Bytes(&self.0);
+        core::fmt::Debug::fmt(&shown, f)
+    }
+}
+
 /// The high-level intermediate representation of a character class.
 ///
 /// A character class corresponds to a set of characters. A character is either
@@ -1262,20 +1267,10 @@ impl ClassBytesRange {
 
 impl core::fmt::Debug for ClassBytesRange {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        let mut debug = f.debug_struct("ClassBytesRange");
-        if self.start <= 0x7F {
-            let ch = char::try_from(self.start).unwrap();
-            debug.field("start", &ch);
-        } else {
-            debug.field("start", &self.start);
-        }
-        if self.end <= 0x7F {
-            let ch = char::try_from(self.start).unwrap();
-            debug.field("end", &ch);
-        } else {
-            debug.field("end", &self.end);
-        }
-        debug.finish()
+        f.debug_struct("ClassBytesRange")
+            .field("start", &crate::debug::Byte(self.start))
+            .field("end", &crate::debug::Byte(self.end))
+            .finish()
     }
 }
 
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
@@ -1326,7 +1326,7 @@ mod tests {
     }
 
     fn hir_lit(s: &str) -> Hir {
-        Hir::literal(s.as_bytes())
+        hir_blit(s.as_bytes())
     }
 
     fn hir_blit(s: &[u8]) -> Hir {
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
@@ -180,6 +180,7 @@ pub use crate::{
 use alloc::string::String;
 
 pub mod ast;
+mod debug;
 mod either;
 mod error;
 pub mod hir;

Original file line number	Diff line number	Diff line change
`@@ -1326,7 +1326,7 @@ mod tests {`
`1326`	`1326`	`}`
`1327`	`1327`
`1328`	`1328`	`fn hir_lit(s: &str) -> Hir {`
`1329`		`- Hir::literal(s.as_bytes())`
	`1329`	`+ hir_blit(s.as_bytes())`
`1330`	`1330`	`}`
`1331`	`1331`
`1332`	`1332`	`fn hir_blit(s: &[u8]) -> Hir {`