From b2b65ec35f3bc84dd75081d73f9ef575d91cb39d Mon Sep 17 00:00:00 2001
From: Owen Shepherd <owen@owen.cafe>
Date: Mon, 27 Jan 2025 22:36:50 +0000
Subject: [PATCH] Add option for posix-lex-compatible regex escapes

In POSIX lex, '\b' represents the backspace character.
In the Rust 'regex' crate, it represents a word boundary assertion.

This patch adds an option to maintain POSIX lex semantics, and tests that
all escapes are interpreted correctly, under both sets of semantics.
---
 lrlex/src/lib/ctbuilder.rs |  9 ++++++
 lrlex/src/lib/lexer.rs     | 64 ++++++++++++++++++++++++++++++++++++++
 lrlex/src/lib/parser.rs    | 18 ++++++++---
 3 files changed, 86 insertions(+), 5 deletions(-)
diff --git a/lrlex/src/lib/ctbuilder.rs b/lrlex/src/lib/ctbuilder.rs
index a70bd0167..90ac46b8c 100644
--- a/lrlex/src/lib/ctbuilder.rs
+++ b/lrlex/src/lib/ctbuilder.rs
@@ -443,6 +443,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
             dot_matches_new_line: {dot_matches_new_line:?},
             multi_line: {multi_line:?},
             octal: {octal:?},
+            posix_escapes: {posix_escapes:?},
             case_insensitive: {case_insensitive:?},
             unicode: {unicode:?},
             swap_greed: {swap_greed:?},
@@ -454,6 +455,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
             dot_matches_new_line = self.regex_options.dot_matches_new_line,
             multi_line = self.regex_options.multi_line,
             octal = self.regex_options.octal,
+            posix_escapes = self.regex_options.posix_escapes,
             case_insensitive = self.regex_options.case_insensitive,
             unicode = self.regex_options.unicode,
             swap_greed = self.regex_options.swap_greed,
@@ -668,6 +670,13 @@ pub fn lexerdef() -> {lexerdef_type} {{
         self
     }
 
+    /// If `true`, `\b` in a regex matches the backspace character (as in POSIX lex) rather
+    /// than asserting a word boundary (as in the `regex` crate). The default value is `false`.
+    pub fn posix_escapes(mut self, flag: bool) -> Self {
+        self.regex_options.posix_escapes = flag;
+        self
+    }
+
     /// Sets the `regex::RegexBuilder` option of the same name.
     /// The default value is `true`.
     pub fn octal(mut self, flag: bool) -> Self {
diff --git a/lrlex/src/lib/lexer.rs b/lrlex/src/lib/lexer.rs
index 10afa0277..8b39807b6 100644
--- a/lrlex/src/lib/lexer.rs
+++ b/lrlex/src/lib/lexer.rs
@@ -25,6 +25,7 @@ pub struct RegexOptions {
     pub dot_matches_new_line: bool,
     pub multi_line: bool,
     pub octal: bool,
+    pub posix_escapes: bool,
     pub case_insensitive: Option<bool>,
     pub swap_greed: Option<bool>,
     pub ignore_whitespace: Option<bool>,
@@ -38,6 +39,7 @@ pub const DEFAULT_REGEX_OPTIONS: RegexOptions = RegexOptions {
     dot_matches_new_line: true,
     multi_line: true,
     octal: true,
+    posix_escapes: false,
     case_insensitive: None,
     ignore_whitespace: None,
     swap_greed: None,
@@ -649,6 +651,68 @@ mod test {
         assert_eq!(lex2.span().len(), 3);
     }
 
+    #[test] // With `posix_escapes` on, `\b` must lex as the backspace character (0x08).
+    fn test_posix_escapes() {
+        let src = r#"%%
+\\ 'slash'
+\a 'alert'
+\b 'backspace'
+\f 'feed'
+\n 'newline'
+\r 'return'
+\t 'tab'
+\v 'vtab'
+\q 'normal_char'
+"#
+        .to_string();
+        let mut options = DEFAULT_REGEX_OPTIONS;
+        options.posix_escapes = true; // the option under test; it is `false` by default
+        let lexerdef =
+            LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
+                .unwrap();
+        let lexemes = lexerdef
+            .lexer("\\\x07\x08\x0c\n\r\t\x0bq") // one input character per rule, in rule order
+            .iter()
+            .map(|x| x.unwrap())
+            .collect::<Vec<_>>();
+        assert_eq!(lexemes.len(), 9);
+        for i in 0..9u8 {
+            let lexeme = lexemes[i as usize];
+            assert_eq!(lexeme.tok_id(), i); // expects lexeme i to have token id i
+        }
+    }
+
+    #[test] // With `posix_escapes` off, `\b` must keep its word-boundary meaning.
+    fn test_non_posix_escapes() {
+        let src = r#"%%
+\\ 'slash'
+\a 'alert'
+a\b a 'word_break'
+\f 'feed'
+\n 'newline'
+\r 'return'
+\t 'tab'
+\v 'vtab'
+\q 'normal_char'
+"#
+        .to_string();
+        let mut options = DEFAULT_REGEX_OPTIONS;
+        options.posix_escapes = false; // already the default; set explicitly for clarity
+        let lexerdef =
+            LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
+                .unwrap();
+        let lexemes = lexerdef
+            .lexer("\\\x07a a\x0c\n\r\t\x0bq") // "a a" exercises the `a\b a` rule
+            .iter()
+            .map(|x| x.unwrap())
+            .collect::<Vec<_>>();
+        assert_eq!(lexemes.len(), 9);
+        for i in 0..9u8 {
+            let lexeme = lexemes[i as usize];
+            assert_eq!(lexeme.tok_id(), i); // expects lexeme i to have token id i
+        }
+    }
+
     #[test]
     fn test_basic_error() {
         let src = "
diff --git a/lrlex/src/lib/parser.rs b/lrlex/src/lib/parser.rs
index b521e696d..71b15258f 100644
--- a/lrlex/src/lib/parser.rs
+++ b/lrlex/src/lib/parser.rs
@@ -21,7 +21,7 @@ lazy_static! {
         Regex::new(r"^%[xX][a-zA-Z0-9]*$").unwrap();
     // Documented in `Escape sequences` in lexcompatibility.m
     static ref RE_LEX_ESC_LITERAL: Regex =
-        Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[AbBz])").unwrap();
+        Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[ABz])").unwrap();
     // Vertical line separators.
     static ref RE_LINE_SEP: Regex = Regex::new(r"[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]").unwrap();
     static ref RE_LEADING_LINE_SEPS: Regex = Regex::new(r"^[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]*").unwrap();
@@ -490,7 +490,7 @@ where
             /// XBD File Format Notation ( '\\', '\a', '\b', '\f' , '\n', '\r', '\t', '\v' ).
             ///
             /// Meaning: The character 'c', unchanged.
-            fn unescape(re: Cow<str>) -> Cow<str> {
+            fn unescape<'b>(re: Cow<'b, str>, regex_options: &'_ RegexOptions) -> Cow<'b, str> {
                 // POSIX lex has two layers of escaping, there are escapes for the regular
                 // expressions themselves and the escapes which get handled by lex directly.
                 // We can find what the `regex` crate needs to be escaped with `is_meta_character`.
@@ -520,7 +520,7 @@ where
                                 if !(regex_syntax::is_meta_character(c2)
                                     || RE_LEX_ESC_LITERAL.is_match(s))
                                 {
-                                    break (Some((i, s, j, c2)));
+                                    break Some((i, s, j, c2));
                                 }
                             }
                         }
@@ -539,7 +539,15 @@ where
                 let mut last_pos = 0;
 
                 'outer: while let Some((i, s, j, c)) = cursor {
-                    if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
+                    if c == 'b' {
+                        unescaped.push_str(&re_str[last_pos..i]);
+                        unescaped.push_str(if regex_options.posix_escapes {
+                            "\\x08"
+                        } else {
+                            "\\b"
+                        });
+                        last_pos = j + 1;
+                    } else if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
                         // For both meta characters and literals we want to push the entire substring
                         // up to and including the c match back into the string still escaped.
                         unescaped.push_str(&re_str[last_pos..j + c.len_utf8()]);
@@ -570,7 +578,7 @@ where
                 Cow::from(unescaped)
             }
 
-            Ok((vec![], unescape(Cow::from(re_str))))
+            Ok((vec![], unescape(Cow::from(re_str), &self.regex_options)))
         } else {
             match re_str.find('>') {
                 None => Err(self.mk_error(LexErrorKind::InvalidStartState, off)),