Add posix_escapes to RegexOptions

414owen · 414owen · commit c3b6c27b3a4f · 2025-02-12T22:25:15.000Z
This makes the POSIX-lex-compatible '\b' escape change backwards-
compatible by putting it behind a new `posix_escapes` flag that defaults to off.
diff --git a/lrlex/src/lib/ctbuilder.rs b/lrlex/src/lib/ctbuilder.rs
@@ -443,6 +443,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
             dot_matches_new_line: {dot_matches_new_line:?},
             multi_line: {multi_line:?},
             octal: {octal:?},
+            posix_escapes: {posix_escapes:?},
             case_insensitive: {case_insensitive:?},
             unicode: {unicode:?},
             swap_greed: {swap_greed:?},
@@ -454,6 +455,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
             dot_matches_new_line = self.regex_options.dot_matches_new_line,
             multi_line = self.regex_options.multi_line,
             octal = self.regex_options.octal,
+            posix_escapes = self.regex_options.posix_escapes,
             case_insensitive = self.regex_options.case_insensitive,
             unicode = self.regex_options.unicode,
             swap_greed = self.regex_options.swap_greed,
@@ -668,6 +670,13 @@ pub fn lexerdef() -> {lexerdef_type} {{
         self
     }
 
+    /// Sets whether `\b` in a rule is the POSIX lex backspace escape rather than a word boundary.
+    /// This is an lrlex option, not a `regex::RegexBuilder` one. The default value is `false`.
+    pub fn posix_escapes(mut self, flag: bool) -> Self {
+        self.regex_options.posix_escapes = flag;
+        self
+    }
+
     /// Sets the `regex::RegexBuilder` option of the same name.
     /// The default value is `true`.
     pub fn octal(mut self, flag: bool) -> Self {
diff --git a/lrlex/src/lib/lexer.rs b/lrlex/src/lib/lexer.rs
@@ -25,6 +25,7 @@ pub struct RegexOptions {
     pub dot_matches_new_line: bool,
     pub multi_line: bool,
     pub octal: bool,
+    pub posix_escapes: bool,
     pub case_insensitive: Option<bool>,
     pub swap_greed: Option<bool>,
     pub ignore_whitespace: Option<bool>,
@@ -38,6 +39,7 @@ pub const DEFAULT_REGEX_OPTIONS: RegexOptions = RegexOptions {
     dot_matches_new_line: true,
     multi_line: true,
     octal: true,
+    posix_escapes: false,
     case_insensitive: None,
     ignore_whitespace: None,
     swap_greed: None,
@@ -650,7 +652,7 @@ mod test {
     }
 
     #[test]
-    fn test_escapes() {
+    fn test_posix_escapes() {
         let src = r#"%%
 \\ 'slash'
 \a 'alert'
@@ -663,7 +665,11 @@ mod test {
 \q 'normal_char'
 "#
         .to_string();
-        let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
+        let mut options = DEFAULT_REGEX_OPTIONS;
+        options.posix_escapes = true;
+        let lexerdef =
+            LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
+                .unwrap();
         let lexemes = lexerdef
             .lexer("\\\x07\x08\x0c\n\r\t\x0bq")
             .iter()
@@ -676,6 +682,37 @@ mod test {
         }
     }
 
+    #[test]
+    fn test_non_posix_escapes() { // posix_escapes off: \b is passed to the regex engine as-is
+        let src = r#"%%
+\\ 'slash'
+\a 'alert'
+a\b a 'work_break'
+\f 'feed'
+\n 'newline'
+\r 'return'
+\t 'tab'
+\v 'vtab'
+\q 'normal_char'
+"#
+        .to_string(); // NOTE(review): token name work_break looks like a typo for word_break
+        let mut options = DEFAULT_REGEX_OPTIONS;
+        options.posix_escapes = false; // already the default value; set explicitly for clarity
+        let lexerdef =
+            LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
+                .unwrap();
+        let lexemes = lexerdef
+            .lexer("\\\x07a a\x0c\n\r\t\x0bq") // one input fragment per rule, in rule order
+            .iter()
+            .map(|x| x.unwrap())
+            .collect::<Vec<_>>();
+        assert_eq!(lexemes.len(), 9); // nine rules above, so nine lexemes expected
+        for i in 0..9u8 {
+            let lexeme = lexemes[i as usize];
+            assert_eq!(lexeme.tok_id(), i); // token ids are expected to follow rule order
+        }
+    }
+
     #[test]
     fn test_basic_error() {
         let src = "
diff --git a/lrlex/src/lib/parser.rs b/lrlex/src/lib/parser.rs
@@ -490,7 +490,7 @@ where
             /// XBD File Format Notation ( '\\', '\a', '\b', '\f' , '\n', '\r', '\t', '\v' ).
             ///
             /// Meaning: The character 'c', unchanged.
-            fn unescape(re: Cow<str>) -> Cow<str> {
+            fn unescape<'a, 'b>(re: Cow<'b, str>, regex_options: &'a RegexOptions) -> Cow<'b, str> {
                 // POSIX lex has two layers of escaping, there are escapes for the regular
                 // expressions themselves and the escapes which get handled by lex directly.
                 // We can find what the `regex` crate needs to be escaped with `is_meta_character`.
@@ -541,7 +541,11 @@ where
                 'outer: while let Some((i, s, j, c)) = cursor {
                     if c == 'b' {
                         unescaped.push_str(&re_str[last_pos..i]);
-                        unescaped.push_str("\\x08");
+                        unescaped.push_str(if regex_options.posix_escapes {
+                            "\\x08"
+                        } else {
+                            "\\b"
+                        });
                         last_pos = j + 1;
                     } else if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
                         // For both meta characters and literals we want to push the entire substring
@@ -574,7 +578,7 @@ where
                 Cow::from(unescaped)
             }
 
-            Ok((vec![], unescape(Cow::from(re_str))))
+            Ok((vec![], unescape(Cow::from(re_str), &self.regex_options)))
         } else {
             match re_str.find('>') {
                 None => Err(self.mk_error(LexErrorKind::InvalidStartState, off)),