Skip to content

Commit b2b65ec

Browse files
committed
Add option for posix-lex-compatible regex escapes
In POSIX lex, `\b` represents the backspace character; in the Rust `regex` crate, it represents a word-boundary assertion. This patch adds an option to keep the POSIX lex semantics, and adds tests checking that all escapes are interpreted correctly under both sets of semantics.
1 parent 6967d50 commit b2b65ec

File tree

3 files changed

+86
-5
lines changed

3 files changed

+86
-5
lines changed

lrlex/src/lib/ctbuilder.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
443443
dot_matches_new_line: {dot_matches_new_line:?},
444444
multi_line: {multi_line:?},
445445
octal: {octal:?},
446+
posix_escapes: {posix_escapes:?},
446447
case_insensitive: {case_insensitive:?},
447448
unicode: {unicode:?},
448449
swap_greed: {swap_greed:?},
@@ -454,6 +455,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
454455
dot_matches_new_line = self.regex_options.dot_matches_new_line,
455456
multi_line = self.regex_options.multi_line,
456457
octal = self.regex_options.octal,
458+
posix_escapes = self.regex_options.posix_escapes,
457459
case_insensitive = self.regex_options.case_insensitive,
458460
unicode = self.regex_options.unicode,
459461
swap_greed = self.regex_options.swap_greed,
@@ -668,6 +670,13 @@ pub fn lexerdef() -> {lexerdef_type} {{
668670
self
669671
}
670672

673+
/// If `true`, `\b` in a lex regex matches the backspace character (POSIX lex semantics) instead of the `regex` crate's word-boundary assertion.
674+
/// The default value is `false`.
675+
pub fn posix_escapes(mut self, flag: bool) -> Self {
676+
self.regex_options.posix_escapes = flag;
677+
self
678+
}
679+
671680
/// Sets the `regex::RegexBuilder` option of the same name.
672681
/// The default value is `true`.
673682
pub fn octal(mut self, flag: bool) -> Self {

lrlex/src/lib/lexer.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ pub struct RegexOptions {
2525
pub dot_matches_new_line: bool,
2626
pub multi_line: bool,
2727
pub octal: bool,
28+
pub posix_escapes: bool,
2829
pub case_insensitive: Option<bool>,
2930
pub swap_greed: Option<bool>,
3031
pub ignore_whitespace: Option<bool>,
@@ -38,6 +39,7 @@ pub const DEFAULT_REGEX_OPTIONS: RegexOptions = RegexOptions {
3839
dot_matches_new_line: true,
3940
multi_line: true,
4041
octal: true,
42+
posix_escapes: false,
4143
case_insensitive: None,
4244
ignore_whitespace: None,
4345
swap_greed: None,
@@ -649,6 +651,68 @@ mod test {
649651
assert_eq!(lex2.span().len(), 3);
650652
}
651653

654+
#[test]
655+
fn test_posix_escapes() {
656+
let src = r#"%%
657+
\\ 'slash'
658+
\a 'alert'
659+
\b 'backspace'
660+
\f 'feed'
661+
\n 'newline'
662+
\r 'return'
663+
\t 'tab'
664+
\v 'vtab'
665+
\q 'normal_char'
666+
"#
667+
.to_string();
668+
let mut options = DEFAULT_REGEX_OPTIONS;
669+
options.posix_escapes = true;
670+
let lexerdef =
671+
LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
672+
.unwrap();
673+
let lexemes = lexerdef
674+
.lexer("\\\x07\x08\x0c\n\r\t\x0bq")
675+
.iter()
676+
.map(|x| x.unwrap())
677+
.collect::<Vec<_>>();
678+
assert_eq!(lexemes.len(), 9);
679+
for i in 0..9u8 {
680+
let lexeme = lexemes[i as usize];
681+
assert_eq!(lexeme.tok_id(), i);
682+
}
683+
}
684+
685+
#[test]
686+
fn test_non_posix_escapes() {
687+
let src = r#"%%
688+
\\ 'slash'
689+
\a 'alert'
690+
a\b a 'work_break'
691+
\f 'feed'
692+
\n 'newline'
693+
\r 'return'
694+
\t 'tab'
695+
\v 'vtab'
696+
\q 'normal_char'
697+
"#
698+
.to_string();
699+
let mut options = DEFAULT_REGEX_OPTIONS;
700+
options.posix_escapes = false;
701+
let lexerdef =
702+
LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
703+
.unwrap();
704+
let lexemes = lexerdef
705+
.lexer("\\\x07a a\x0c\n\r\t\x0bq")
706+
.iter()
707+
.map(|x| x.unwrap())
708+
.collect::<Vec<_>>();
709+
assert_eq!(lexemes.len(), 9);
710+
for i in 0..9u8 {
711+
let lexeme = lexemes[i as usize];
712+
assert_eq!(lexeme.tok_id(), i);
713+
}
714+
}
715+
652716
#[test]
653717
fn test_basic_error() {
654718
let src = "

lrlex/src/lib/parser.rs

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ lazy_static! {
2121
Regex::new(r"^%[xX][a-zA-Z0-9]*$").unwrap();
2222
// Documented in `Escape sequences` in lexcompatibility.md
2323
static ref RE_LEX_ESC_LITERAL: Regex =
24-
Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[AbBz])").unwrap();
24+
Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[Az])").unwrap();
2525
// Vertical line separators.
2626
static ref RE_LINE_SEP: Regex = Regex::new(r"[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]").unwrap();
2727
static ref RE_LEADING_LINE_SEPS: Regex = Regex::new(r"^[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]*").unwrap();
@@ -490,7 +490,7 @@ where
490490
/// XBD File Format Notation ( '\\', '\a', '\b', '\f' , '\n', '\r', '\t', '\v' ).
491491
///
492492
/// Meaning: The character 'c', unchanged.
493-
fn unescape(re: Cow<str>) -> Cow<str> {
493+
fn unescape<'b>(re: Cow<'b, str>, regex_options: &'_ RegexOptions) -> Cow<'b, str> {
494494
// POSIX lex has two layers of escaping, there are escapes for the regular
495495
// expressions themselves and the escapes which get handled by lex directly.
496496
// We can find what the `regex` crate needs to be escaped with `is_meta_character`.
@@ -520,7 +520,7 @@ where
520520
if !(regex_syntax::is_meta_character(c2)
521521
|| RE_LEX_ESC_LITERAL.is_match(s))
522522
{
523-
break (Some((i, s, j, c2)));
523+
break Some((i, s, j, c2));
524524
}
525525
}
526526
}
@@ -539,7 +539,15 @@ where
539539
let mut last_pos = 0;
540540

541541
'outer: while let Some((i, s, j, c)) = cursor {
542-
if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
542+
if c == 'b' {
543+
unescaped.push_str(&re_str[last_pos..i]);
544+
unescaped.push_str(if regex_options.posix_escapes {
545+
"\\x08"
546+
} else {
547+
"\\b"
548+
});
549+
last_pos = j + 1;
550+
} else if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
543551
// For both meta characters and literals we want to push the entire substring
544552
// up to and including the c match back into the string still escaped.
545553
unescaped.push_str(&re_str[last_pos..j + c.len_utf8()]);
@@ -570,7 +578,7 @@ where
570578
Cow::from(unescaped)
571579
}
572580

573-
Ok((vec![], unescape(Cow::from(re_str))))
581+
Ok((vec![], unescape(Cow::from(re_str), &self.regex_options)))
574582
} else {
575583
match re_str.find('>') {
576584
None => Err(self.mk_error(LexErrorKind::InvalidStartState, off)),

0 commit comments

Comments
 (0)