Fix backspace lexeme escaping

414owen · 414owen · commit 15b2bd441c55 · 2025-01-27T22:40:49.000Z
In POSIX lex, '\b' represents the backspace character.
In the Rust 'regex' crate, it represents a word boundary assertion.

This patch adds a test checking that all POSIX escapes are interpreted
correctly, and fixes the backspace escape by rewriting '\b' to '\x08'.
diff --git a/lrlex/src/lib/lexer.rs b/lrlex/src/lib/lexer.rs
@@ -649,6 +649,33 @@ mod test {
         assert_eq!(lex2.span().len(), 3);
     }
 
+    #[test]
+    fn test_escapes() { // Each escape rule below must match exactly one input character.
+        let src = r#"%%
+\\ 'slash'
+\a 'alert'
+\b 'backspace'
+\f 'feed'
+\n 'newline'
+\r 'return'
+\t 'tab'
+\v 'vtab'
+\q 'normal_char'
+"#
+        .to_string();
+        let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap(); // panics if the definition is rejected
+        let lexemes = lexerdef
+            .lexer("\\\x07\x08\x0c\n\r\t\x0bq") // backslash, BEL, BS, FF, LF, CR, TAB, VT, 'q': one char per rule, in rule order
+            .iter()
+            .map(|x| x.unwrap()) // any lexing error fails the test here
+            .collect::<Vec<_>>();
+        assert_eq!(lexemes.len(), 9); // nine rules, nine single-character lexemes
+        for i in 0..9u8 {
+            let lexeme = lexemes[i as usize];
+            assert_eq!(lexeme.tok_id(), i); // expects token ids to follow rule order (NOTE(review): confirm id assignment)
+        }
+    }
+
     #[test]
     fn test_basic_error() {
         let src = "
diff --git a/lrlex/src/lib/parser.rs b/lrlex/src/lib/parser.rs
@@ -21,7 +21,7 @@ lazy_static! {
         Regex::new(r"^%[xX][a-zA-Z0-9]*$").unwrap();
     // Documented in `Escape sequences` in lexcompatibility.m
     static ref RE_LEX_ESC_LITERAL: Regex =
-        Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[AbBz])").unwrap();
+        Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[Az])").unwrap();
     // Vertical line separators.
     static ref RE_LINE_SEP: Regex = Regex::new(r"[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]").unwrap();
     static ref RE_LEADING_LINE_SEPS: Regex = Regex::new(r"^[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]*").unwrap();
@@ -520,7 +520,7 @@ where
                                 if !(regex_syntax::is_meta_character(c2)
                                     || RE_LEX_ESC_LITERAL.is_match(s))
                                 {
-                                    break (Some((i, s, j, c2)));
+                                    break Some((i, s, j, c2));
                                 }
                             }
                         }
@@ -539,7 +539,11 @@ where
                 let mut last_pos = 0;
 
                 'outer: while let Some((i, s, j, c)) = cursor {
-                    if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
+                    if c == 'b' {
+                        unescaped.push_str(&re_str[last_pos..i]);
+                        unescaped.push_str("\\x08");
+                        last_pos = j + 1;
+                    } else if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
                         // For both meta characters and literals we want to push the entire substring
                         // up to and including the c match back into the string still escaped.
                         unescaped.push_str(&re_str[last_pos..j + c.len_utf8()]);