Skip to content

Commit 15b2bd4

Browse files
committed
Fix backspace lexeme escaping
In POSIX lex, `\b` represents the backspace character, whereas in the Rust `regex` crate it represents a word-boundary assertion. This patch adds a test that all POSIX escapes are interpreted correctly, and fixes the incongruity in how the backspace escape is handled.
1 parent 6967d50 commit 15b2bd4

File tree

2 files changed

+34
-3
lines changed

2 files changed

+34
-3
lines changed

lrlex/src/lib/lexer.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -649,6 +649,33 @@ mod test {
649649
assert_eq!(lex2.span().len(), 3);
650650
}
651651

652+
#[test]
653+
fn test_escapes() {
654+
let src = r#"%%
655+
\\ 'slash'
656+
\a 'alert'
657+
\b 'backspace'
658+
\f 'feed'
659+
\n 'newline'
660+
\r 'return'
661+
\t 'tab'
662+
\v 'vtab'
663+
\q 'normal_char'
664+
"#
665+
.to_string();
666+
let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
667+
let lexemes = lexerdef
668+
.lexer("\\\x07\x08\x0c\n\r\t\x0bq")
669+
.iter()
670+
.map(|x| x.unwrap())
671+
.collect::<Vec<_>>();
672+
assert_eq!(lexemes.len(), 9);
673+
for i in 0..9u8 {
674+
let lexeme = lexemes[i as usize];
675+
assert_eq!(lexeme.tok_id(), i);
676+
}
677+
}
678+
652679
#[test]
653680
fn test_basic_error() {
654681
let src = "

lrlex/src/lib/parser.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ lazy_static! {
2121
Regex::new(r"^%[xX][a-zA-Z0-9]*$").unwrap();
2222
// Documented in `Escape sequences` in lexcompatibility.md
2323
static ref RE_LEX_ESC_LITERAL: Regex =
24-
Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[AbBz])").unwrap();
24+
Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[Az])").unwrap();
2525
// Vertical line separators.
2626
static ref RE_LINE_SEP: Regex = Regex::new(r"[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]").unwrap();
2727
static ref RE_LEADING_LINE_SEPS: Regex = Regex::new(r"^[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]*").unwrap();
@@ -520,7 +520,7 @@ where
520520
if !(regex_syntax::is_meta_character(c2)
521521
|| RE_LEX_ESC_LITERAL.is_match(s))
522522
{
523-
break (Some((i, s, j, c2)));
523+
break Some((i, s, j, c2));
524524
}
525525
}
526526
}
@@ -539,7 +539,11 @@ where
539539
let mut last_pos = 0;
540540

541541
'outer: while let Some((i, s, j, c)) = cursor {
542-
if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
542+
if c == 'b' {
543+
unescaped.push_str(&re_str[last_pos..i]);
544+
unescaped.push_str("\\x08");
545+
last_pos = j + 1;
546+
} else if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
543547
// For both meta characters and literals we want to push the entire substring
544548
// up to and including the c match back into the string still escaped.
545549
unescaped.push_str(&re_str[last_pos..j + c.len_utf8()]);

0 commit comments

Comments
 (0)