Skip to content

Commit 4e939af

Browse files
authored
Merge pull request #483 from 414owen/os/fix-backspace-lexing
Fix backspace lexeme escaping
2 parents d8ea28b + b2b65ec commit 4e939af

File tree

3 files changed

+85
-4
lines changed

3 files changed

+85
-4
lines changed

lrlex/src/lib/ctbuilder.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
443443
dot_matches_new_line: {dot_matches_new_line:?},
444444
multi_line: {multi_line:?},
445445
octal: {octal:?},
446+
posix_escapes: {posix_escapes:?},
446447
case_insensitive: {case_insensitive:?},
447448
unicode: {unicode:?},
448449
swap_greed: {swap_greed:?},
@@ -454,6 +455,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
454455
dot_matches_new_line = self.regex_options.dot_matches_new_line,
455456
multi_line = self.regex_options.multi_line,
456457
octal = self.regex_options.octal,
458+
posix_escapes = self.regex_options.posix_escapes,
457459
case_insensitive = self.regex_options.case_insensitive,
458460
unicode = self.regex_options.unicode,
459461
swap_greed = self.regex_options.swap_greed,
@@ -668,6 +670,13 @@ pub fn lexerdef() -> {lexerdef_type} {{
668670
self
669671
}
670672

673+
/// Controls how the `\b` escape in lex regular expressions is interpreted.
674+
/// When `true`, `\b` is rewritten to `\x08` so that it matches a backspace
/// character; when `false`, it is passed to the regex engine unchanged as `\b`
/// (a word boundary in `regex` syntax). This flag is consumed by lrlex while
/// unescaping the lex source.
/// The default value is `false`.
675+
pub fn posix_escapes(mut self, flag: bool) -> Self {
676+
self.regex_options.posix_escapes = flag;
677+
self
678+
}
679+
671680
/// Sets the `regex::RegexBuilder` option of the same name.
672681
/// The default value is `true`.
673682
pub fn octal(mut self, flag: bool) -> Self {

lrlex/src/lib/lexer.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ pub struct RegexOptions {
2525
pub dot_matches_new_line: bool,
2626
pub multi_line: bool,
2727
pub octal: bool,
28+
pub posix_escapes: bool,
2829
pub case_insensitive: Option<bool>,
2930
pub swap_greed: Option<bool>,
3031
pub ignore_whitespace: Option<bool>,
@@ -38,6 +39,7 @@ pub const DEFAULT_REGEX_OPTIONS: RegexOptions = RegexOptions {
3839
dot_matches_new_line: true,
3940
multi_line: true,
4041
octal: true,
42+
posix_escapes: false,
4143
case_insensitive: None,
4244
ignore_whitespace: None,
4345
swap_greed: None,
@@ -648,6 +650,68 @@ mod test {
648650
assert_eq!(lex2.span().len(), 3);
649651
}
650652

653+
// Checks that, with `posix_escapes` enabled, each escape in the lex source
// (`\\`, `\a`, `\b`, `\f`, `\n`, `\r`, `\t`, `\v`) matches the corresponding
// single character, and that an unrecognised escape (`\q`) matches plain `q`.
#[test]
654+
fn test_posix_escapes() {
655+
let src = r#"%%
656+
\\ 'slash'
657+
\a 'alert'
658+
\b 'backspace'
659+
\f 'feed'
660+
\n 'newline'
661+
\r 'return'
662+
\t 'tab'
663+
\v 'vtab'
664+
\q 'normal_char'
665+
"#
666+
.to_string();
667+
// Start from the defaults and switch on only the option under test.
let mut options = DEFAULT_REGEX_OPTIONS;
668+
options.posix_escapes = true;
669+
let lexerdef =
670+
LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
671+
.unwrap();
672+
// The input holds one character per rule, in the same order as the rules
// above: backslash, BEL (0x07), backspace (0x08), form feed (0x0c), newline,
// carriage return, tab, vertical tab (0x0b) and `q`.
let lexemes = lexerdef
673+
.lexer("\\\x07\x08\x0c\n\r\t\x0bq")
674+
.iter()
675+
.map(|x| x.unwrap())
676+
.collect::<Vec<_>>();
677+
assert_eq!(lexemes.len(), 9);
678+
// Because the input follows the rule order, lexeme `i` is expected to carry
// token id `i`.
for i in 0..9u8 {
679+
let lexeme = lexemes[i as usize];
680+
assert_eq!(lexeme.tok_id(), i);
681+
}
682+
}
683+
684+
// Counterpart of `test_posix_escapes` with `posix_escapes` disabled: `\b` is
// no longer a backspace, so the third rule uses it between `a` and ` a`
// (presumably as a regex word boundary) and the input supplies `a a` for it.
#[test]
685+
fn test_non_posix_escapes() {
686+
let src = r#"%%
687+
\\ 'slash'
688+
\a 'alert'
689+
a\b a 'work_break'
690+
\f 'feed'
691+
\n 'newline'
692+
\r 'return'
693+
\t 'tab'
694+
\v 'vtab'
695+
\q 'normal_char'
696+
"#
697+
.to_string();
698+
// NOTE(review): the token name 'work_break' in the lex source above looks like
// a typo for 'word_break'; it is harmless here because only token ids are
// checked below.
let mut options = DEFAULT_REGEX_OPTIONS;
699+
// `false` is already the default; it is set explicitly to make the intent of
// the test obvious.
options.posix_escapes = false;
700+
let lexerdef =
701+
LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
702+
.unwrap();
703+
// Same layout as the POSIX test, except that the third lexeme is the text
// `a a` rather than a backspace character.
let lexemes = lexerdef
704+
.lexer("\\\x07a a\x0c\n\r\t\x0bq")
705+
.iter()
706+
.map(|x| x.unwrap())
707+
.collect::<Vec<_>>();
708+
assert_eq!(lexemes.len(), 9);
709+
// Because the input follows the rule order, lexeme `i` is expected to carry
// token id `i`.
for i in 0..9u8 {
710+
let lexeme = lexemes[i as usize];
711+
assert_eq!(lexeme.tok_id(), i);
712+
}
713+
}
714+
651715
#[test]
652716
fn test_basic_error() {
653717
let src = "

lrlex/src/lib/parser.rs

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ lazy_static! {
2121
Regex::new(r"^%[xX][a-zA-Z0-9]*$").unwrap();
2222
// Documented in `Escape sequences` in lexcompatibility.md
2323
static ref RE_LEX_ESC_LITERAL: Regex =
24-
Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[AbBz])").unwrap();
24+
Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[Az])").unwrap();
2525
// Vertical line separators.
2626
static ref RE_LINE_SEP: Regex = Regex::new(r"[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]").unwrap();
2727
static ref RE_LEADING_LINE_SEPS: Regex = Regex::new(r"^[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]*").unwrap();
@@ -490,7 +490,7 @@ where
490490
/// XBD File Format Notation ( '\\', '\a', '\b', '\f' , '\n', '\r', '\t', '\v' ).
491491
///
492492
/// Meaning: The character 'c', unchanged.
493-
fn unescape(re: Cow<str>) -> Cow<str> {
493+
fn unescape<'b>(re: Cow<'b, str>, regex_options: &'_ RegexOptions) -> Cow<'b, str> {
494494
// POSIX lex has two layers of escaping, there are escapes for the regular
495495
// expressions themselves and the escapes which get handled by lex directly.
496496
// We can find what the `regex` crate needs to be escaped with `is_meta_character`.
@@ -539,7 +539,15 @@ where
539539
let mut last_pos = 0;
540540

541541
'outer: while let Some((i, s, j, c)) = cursor {
542-
if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
542+
if c == 'b' {
543+
unescaped.push_str(&re_str[last_pos..i]);
544+
unescaped.push_str(if regex_options.posix_escapes {
545+
"\\x08"
546+
} else {
547+
"\\b"
548+
});
549+
last_pos = j + 1;
550+
} else if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
543551
// For both meta characters and literals we want to push the entire substring
544552
// up to and including the c match back into the string still escaped.
545553
unescaped.push_str(&re_str[last_pos..j + c.len_utf8()]);
@@ -570,7 +578,7 @@ where
570578
Cow::from(unescaped)
571579
}
572580

573-
Ok((vec![], unescape(Cow::from(re_str))))
581+
Ok((vec![], unescape(Cow::from(re_str), &self.regex_options)))
574582
} else {
575583
match re_str.find('>') {
576584
None => Err(self.mk_error(LexErrorKind::InvalidStartState, off)),

0 commit comments

Comments
 (0)