From b2b65ec35f3bc84dd75081d73f9ef575d91cb39d Mon Sep 17 00:00:00 2001 From: Owen Shepherd Date: Mon, 27 Jan 2025 22:36:50 +0000 Subject: [PATCH] Add option for posix-lex-compatible regex escapes In POSIX lex, '\b' represents the backspace character. In the Rust 'regex' crate, it represents a word boundary assertion. This patch adds an option to maintain POSIX lex semantics, and tests that all escapes are interpreted correctly under both sets of semantics. --- lrlex/src/lib/ctbuilder.rs | 9 ++++++ lrlex/src/lib/lexer.rs | 64 ++++++++++++++++++++++++++++++++++++++ lrlex/src/lib/parser.rs | 18 ++++++++--- 3 files changed, 86 insertions(+), 5 deletions(-) diff --git a/lrlex/src/lib/ctbuilder.rs b/lrlex/src/lib/ctbuilder.rs index a70bd0167..90ac46b8c 100644 --- a/lrlex/src/lib/ctbuilder.rs +++ b/lrlex/src/lib/ctbuilder.rs @@ -443,6 +443,7 @@ pub fn lexerdef() -> {lexerdef_type} {{ dot_matches_new_line: {dot_matches_new_line:?}, multi_line: {multi_line:?}, octal: {octal:?}, + posix_escapes: {posix_escapes:?}, case_insensitive: {case_insensitive:?}, unicode: {unicode:?}, swap_greed: {swap_greed:?}, @@ -454,6 +455,7 @@ pub fn lexerdef() -> {lexerdef_type} {{ dot_matches_new_line = self.regex_options.dot_matches_new_line, multi_line = self.regex_options.multi_line, octal = self.regex_options.octal, + posix_escapes = self.regex_options.posix_escapes, case_insensitive = self.regex_options.case_insensitive, unicode = self.regex_options.unicode, swap_greed = self.regex_options.swap_greed, @@ -668,6 +670,13 @@ pub fn lexerdef() -> {lexerdef_type} {{ self } + /// Sets whether `\b` in a lexer regex is read as the POSIX lex backspace escape (`true`) or as + /// the `regex` crate's word-boundary assertion (`false`). The default value is `false`. + pub fn posix_escapes(mut self, flag: bool) -> Self { + self.regex_options.posix_escapes = flag; + self + } + /// Sets the `regex::RegexBuilder` option of the same name. /// The default value is `true`. 
pub fn octal(mut self, flag: bool) -> Self { diff --git a/lrlex/src/lib/lexer.rs b/lrlex/src/lib/lexer.rs index 10afa0277..8b39807b6 100644 --- a/lrlex/src/lib/lexer.rs +++ b/lrlex/src/lib/lexer.rs @@ -25,6 +25,7 @@ pub struct RegexOptions { pub dot_matches_new_line: bool, pub multi_line: bool, pub octal: bool, + pub posix_escapes: bool, pub case_insensitive: Option<bool>, pub swap_greed: Option<bool>, pub ignore_whitespace: Option<bool>, @@ -38,6 +39,7 @@ pub const DEFAULT_REGEX_OPTIONS: RegexOptions = RegexOptions { dot_matches_new_line: true, multi_line: true, octal: true, + posix_escapes: false, case_insensitive: None, ignore_whitespace: None, swap_greed: None, @@ -649,6 +651,68 @@ mod test { assert_eq!(lex2.span().len(), 3); } + #[test] + fn test_posix_escapes() { + let src = r#"%% +\\ 'slash' +\a 'alert' +\b 'backspace' +\f 'feed' +\n 'newline' +\r 'return' +\t 'tab' +\v 'vtab' +\q 'normal_char' +"# + .to_string(); + let mut options = DEFAULT_REGEX_OPTIONS; + options.posix_escapes = true; + let lexerdef = + LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options) + .unwrap(); + let lexemes = lexerdef + .lexer("\\\x07\x08\x0c\n\r\t\x0bq") + .iter() + .map(|x| x.unwrap()) + .collect::<Vec<_>>(); + assert_eq!(lexemes.len(), 9); + for i in 0..9u8 { + let lexeme = lexemes[i as usize]; + assert_eq!(lexeme.tok_id(), i); + } + } + + #[test] + fn test_non_posix_escapes() { + let src = r#"%% +\\ 'slash' +\a 'alert' +a\b a 'work_break' +\f 'feed' +\n 'newline' +\r 'return' +\t 'tab' +\v 'vtab' +\q 'normal_char' +"# + .to_string(); + let mut options = DEFAULT_REGEX_OPTIONS; + options.posix_escapes = false; + let lexerdef = + LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options) + .unwrap(); + let lexemes = lexerdef + .lexer("\\\x07a a\x0c\n\r\t\x0bq") + .iter() + .map(|x| x.unwrap()) + .collect::<Vec<_>>(); + assert_eq!(lexemes.len(), 9); + for i in 0..9u8 { + let lexeme = lexemes[i as usize]; + assert_eq!(lexeme.tok_id(), i); + } + } + #[test] fn test_basic_error() { let src = " diff --git 
a/lrlex/src/lib/parser.rs b/lrlex/src/lib/parser.rs index b521e696d..71b15258f 100644 --- a/lrlex/src/lib/parser.rs +++ b/lrlex/src/lib/parser.rs @@ -21,7 +21,7 @@ lazy_static! { Regex::new(r"^%[xX][a-zA-Z0-9]*$").unwrap(); // Documented in `Escape sequences` in lexcompatibility.m static ref RE_LEX_ESC_LITERAL: Regex = - Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[AbBz])").unwrap(); + Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[Az])").unwrap(); // Vertical line separators. static ref RE_LINE_SEP: Regex = Regex::new(r"[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]").unwrap(); static ref RE_LEADING_LINE_SEPS: Regex = Regex::new(r"^[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]*").unwrap(); @@ -490,7 +490,7 @@ where /// XBD File Format Notation ( '\\', '\a', '\b', '\f' , '\n', '\r', '\t', '\v' ). /// /// Meaning: The character 'c', unchanged. - fn unescape(re: Cow<str>) -> Cow<str> { + fn unescape<'b>(re: Cow<'b, str>, regex_options: &'_ RegexOptions) -> Cow<'b, str> { // POSIX lex has two layers of escaping, there are escapes for the regular // expressions themselves and the escapes which get handled by lex directly. // We can find what the `regex` crate needs to be escaped with `is_meta_character`. 
@@ -520,7 +520,7 @@ where if !(regex_syntax::is_meta_character(c2) || RE_LEX_ESC_LITERAL.is_match(s)) { - break (Some((i, s, j, c2))); + break Some((i, s, j, c2)); } } } @@ -539,7 +539,15 @@ where let mut last_pos = 0; 'outer: while let Some((i, s, j, c)) = cursor { - if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) { + if c == 'b' { + unescaped.push_str(&re_str[last_pos..i]); + unescaped.push_str(if regex_options.posix_escapes { + "\\x08" + } else { + "\\b" + }); + last_pos = j + 1; + } else if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) { // For both meta characters and literals we want to push the entire substring // up to and including the c match back into the string still escaped. unescaped.push_str(&re_str[last_pos..j + c.len_utf8()]); @@ -570,7 +578,7 @@ where Cow::from(unescaped) } - Ok((vec![], unescape(Cow::from(re_str)))) + Ok((vec![], unescape(Cow::from(re_str), &self.regex_options))) } else { match re_str.find('>') { None => Err(self.mk_error(LexErrorKind::InvalidStartState, off)),