Skip to content

Commit 1fdb809

Browse files
committed
syntax: improve Debug impls
This makes the Debug impls for Literal and ClassBytesRange a bit better. The former in particular. Instead of just printing a sequence of decimal numbers, we now print them as characters. Given the lackluster support for Vec<u8> as a string in the standard library, we copy a little bit of code from regex-automata to make the debug print for the Vec<u8> basically as nice as a String.
1 parent 0e95c01 commit 1fdb809

File tree

4 files changed

+122
-18
lines changed

4 files changed

+122
-18
lines changed

regex-syntax/src/debug.rs

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/// A type that wraps a single byte with a convenient `fmt::Debug` impl that
/// escapes the byte.
///
/// The output is whatever `core::ascii::escape_default` produces for the
/// byte, with two adjustments: an ASCII space is shown as `' '`, and the hex
/// digits of a `\xNN` escape are upper-cased (e.g. `\xAB`).
pub(crate) struct Byte(pub(crate) u8);

impl core::fmt::Debug for Byte {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Needed for `write_char` on the formatter.
        use core::fmt::Write;

        // Special case ASCII space. It's too hard to read otherwise, so
        // put quotes around it. I sometimes wonder whether just '\x20' would
        // be better...
        if self.0 == b' ' {
            return f.write_str("' '");
        }
        for (i, b) in core::ascii::escape_default(self.0).enumerate() {
            // Only `\xNN` escapes are longer than two bytes, so anything at
            // index 2 or later is a hex digit: capitalize \xab to \xAB.
            let b = if i >= 2 { b.to_ascii_uppercase() } else { b };
            // `escape_default` only ever yields ASCII, so each byte maps
            // directly onto a `char` and no UTF-8 validation is required.
            f.write_char(char::from(b))?;
        }
        Ok(())
    }
}
27+
28+
/// A type that provides a human readable debug impl for arbitrary bytes.
29+
///
30+
/// This generally works best when the bytes are presumed to be mostly UTF-8,
31+
/// but will work for anything.
32+
///
33+
/// N.B. This is copied nearly verbatim from regex-automata. Sigh.
34+
pub(crate) struct Bytes<'a>(pub(crate) &'a [u8]);
35+
36+
impl<'a> core::fmt::Debug for Bytes<'a> {
37+
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
38+
write!(f, "\"")?;
39+
// This is a sad re-implementation of a similar impl found in bstr.
40+
let mut bytes = self.0;
41+
while let Some(result) = utf8_decode(bytes) {
42+
let ch = match result {
43+
Ok(ch) => ch,
44+
Err(byte) => {
45+
write!(f, r"\x{:02x}", byte)?;
46+
bytes = &bytes[1..];
47+
continue;
48+
}
49+
};
50+
bytes = &bytes[ch.len_utf8()..];
51+
match ch {
52+
'\0' => write!(f, "\\0")?,
53+
// ASCII control characters except \0, \n, \r, \t
54+
'\x01'..='\x08'
55+
| '\x0b'
56+
| '\x0c'
57+
| '\x0e'..='\x19'
58+
| '\x7f' => {
59+
write!(f, "\\x{:02x}", u32::from(ch))?;
60+
}
61+
'\n' | '\r' | '\t' | _ => {
62+
write!(f, "{}", ch.escape_debug())?;
63+
}
64+
}
65+
}
66+
write!(f, "\"")?;
67+
Ok(())
68+
}
69+
}
70+
71+
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// Only the leading codepoint matters: whatever follows it (valid or not)
/// has no influence on the result.
///
/// This returns `None` if and only if `bytes` is empty.
fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
    let first = *bytes.first()?;
    // A codepoint is encoded in at most 4 bytes, so a 4 byte window always
    // contains the whole leading codepoint when one exists.
    let window = &bytes[..core::cmp::min(4, bytes.len())];
    let valid = match core::str::from_utf8(window) {
        Ok(s) => s,
        // The window as a whole being invalid does not mean the leading
        // codepoint is: it may merely be followed by garbage or by a
        // truncated sequence. Keep the prefix that did validate.
        Err(err) => match core::str::from_utf8(&window[..err.valid_up_to()]) {
            Ok(s) => s,
            Err(_) => return Some(Err(first)),
        },
    };
    // An empty valid prefix means the very first byte is already invalid.
    Some(valid.chars().next().ok_or(first))
}
86+
87+
/*
88+
/// Given a UTF-8 leading byte, this returns the total number of code units
89+
/// in the following encoded codepoint.
90+
///
91+
/// If the given byte is not a valid UTF-8 leading byte, then this returns
92+
/// `None`.
93+
fn len(byte: u8) -> Option<usize> {
94+
if byte <= 0x7F {
95+
return Some(1);
96+
} else if byte & 0b1100_0000 == 0b1000_0000 {
97+
return None;
98+
} else if byte <= 0b1101_1111 {
99+
Some(2)
100+
} else if byte <= 0b1110_1111 {
101+
Some(3)
102+
} else if byte <= 0b1111_0111 {
103+
Some(4)
104+
} else {
105+
None
106+
}
107+
}
108+
*/

regex-syntax/src/hir/mod.rs

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -699,8 +699,7 @@ impl HirKind {
699699
/// to the size of the `Hir`.
700700
impl core::fmt::Display for Hir {
701701
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
702-
use crate::hir::print::Printer;
703-
Printer::new().print(self, f)
702+
crate::hir::print::Printer::new().print(self, f)
704703
}
705704
}
706705

@@ -710,9 +709,15 @@ impl core::fmt::Display for Hir {
710709
/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
711710
/// are preferred whenever possible. In particular, a `Byte` variant is only
712711
/// ever produced when it could match invalid UTF-8.
713-
#[derive(Clone, Debug, Eq, PartialEq)]
712+
#[derive(Clone, Eq, PartialEq)]
714713
pub struct Literal(pub Box<[u8]>);
715714

715+
impl core::fmt::Debug for Literal {
716+
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
717+
crate::debug::Bytes(&self.0).fmt(f)
718+
}
719+
}
720+
716721
/// The high-level intermediate representation of a character class.
717722
///
718723
/// A character class corresponds to a set of characters. A character is either
@@ -1262,20 +1267,10 @@ impl ClassBytesRange {
12621267

12631268
impl core::fmt::Debug for ClassBytesRange {
12641269
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1265-
let mut debug = f.debug_struct("ClassBytesRange");
1266-
if self.start <= 0x7F {
1267-
let ch = char::try_from(self.start).unwrap();
1268-
debug.field("start", &ch);
1269-
} else {
1270-
debug.field("start", &self.start);
1271-
}
1272-
if self.end <= 0x7F {
1273-
let ch = char::try_from(self.start).unwrap();
1274-
debug.field("end", &ch);
1275-
} else {
1276-
debug.field("end", &self.end);
1277-
}
1278-
debug.finish()
1270+
f.debug_struct("ClassBytesRange")
1271+
.field("start", &crate::debug::Byte(self.start))
1272+
.field("end", &crate::debug::Byte(self.end))
1273+
.finish()
12791274
}
12801275
}
12811276

regex-syntax/src/hir/translate.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1326,7 +1326,7 @@ mod tests {
13261326
}
13271327

13281328
fn hir_lit(s: &str) -> Hir {
1329-
Hir::literal(s.as_bytes())
1329+
hir_blit(s.as_bytes())
13301330
}
13311331

13321332
fn hir_blit(s: &[u8]) -> Hir {

regex-syntax/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ pub use crate::{
180180
use alloc::string::String;
181181

182182
pub mod ast;
183+
mod debug;
183184
mod either;
184185
mod error;
185186
pub mod hir;

0 commit comments

Comments
 (0)