Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions lrlex/src/lib/ctbuilder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
dot_matches_new_line: {dot_matches_new_line:?},
multi_line: {multi_line:?},
octal: {octal:?},
posix_escapes: {posix_escapes:?},
case_insensitive: {case_insensitive:?},
unicode: {unicode:?},
swap_greed: {swap_greed:?},
Expand All @@ -454,6 +455,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
dot_matches_new_line = self.regex_options.dot_matches_new_line,
multi_line = self.regex_options.multi_line,
octal = self.regex_options.octal,
posix_escapes = self.regex_options.posix_escapes,
case_insensitive = self.regex_options.case_insensitive,
unicode = self.regex_options.unicode,
swap_greed = self.regex_options.swap_greed,
Expand Down Expand Up @@ -668,6 +670,13 @@ pub fn lexerdef() -> {lexerdef_type} {{
self
}

/// Controls how the lex escape sequence `\b` is translated before a rule's
/// pattern is handed to the `regex` crate.
///
/// When `flag` is `true`, an escaped `b` is rewritten to `\x08`, i.e. it matches
/// the backspace character as in POSIX lex. When `flag` is `false`, `\b` is
/// passed through unchanged, which the `regex` crate interprets as a word
/// boundary assertion.
///
/// This flag is handled by lrlex's own escape processing; it is stored in
/// `regex_options` alongside the settings that mirror `regex::RegexBuilder`.
///
/// The default value is `false`.
pub fn posix_escapes(mut self, flag: bool) -> Self {
self.regex_options.posix_escapes = flag;
self
}

/// Sets the `regex::RegexBuilder` option of the same name.
/// The default value is `true`.
pub fn octal(mut self, flag: bool) -> Self {
Expand Down
64 changes: 64 additions & 0 deletions lrlex/src/lib/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pub struct RegexOptions {
pub dot_matches_new_line: bool,
pub multi_line: bool,
pub octal: bool,
pub posix_escapes: bool,
pub case_insensitive: Option<bool>,
pub swap_greed: Option<bool>,
pub ignore_whitespace: Option<bool>,
Expand All @@ -38,6 +39,7 @@ pub const DEFAULT_REGEX_OPTIONS: RegexOptions = RegexOptions {
dot_matches_new_line: true,
multi_line: true,
octal: true,
posix_escapes: false,
case_insensitive: None,
ignore_whitespace: None,
swap_greed: None,
Expand Down Expand Up @@ -649,6 +651,68 @@ mod test {
assert_eq!(lex2.span().len(), 3);
}

/// With `posix_escapes` enabled, every POSIX escape (`\\`, `\a`, `\b`, `\f`,
/// `\n`, `\r`, `\t`, `\v`) must match its single control character, and an
/// escaped ordinary character (`\q`) must match that character itself.
#[test]
fn test_posix_escapes() {
    // One rule per escape; rules are listed in the same order as the
    // characters of the input below, so the n-th lexeme must carry token id n.
    let src = r#"%%
\\ 'slash'
\a 'alert'
\b 'backspace'
\f 'feed'
\n 'newline'
\r 'return'
\t 'tab'
\v 'vtab'
\q 'normal_char'
"#
    .to_string();

    let mut options = DEFAULT_REGEX_OPTIONS;
    options.posix_escapes = true;

    let lexerdef =
        LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
            .unwrap();

    // Here `\x08` (backspace) is the character the `\b` rule has to match.
    let lexemes = lexerdef
        .lexer("\\\x07\x08\x0c\n\r\t\x0bq")
        .iter()
        .map(|res| res.unwrap())
        .collect::<Vec<_>>();

    assert_eq!(lexemes.len(), 9);
    for (expected_id, lexeme) in (0..9u8).zip(lexemes.iter()) {
        assert_eq!(lexeme.tok_id(), expected_id);
    }
}

/// With `posix_escapes` disabled, `\b` is left for the `regex` crate to
/// interpret (a word boundary) instead of being turned into a backspace, while
/// the remaining escapes keep matching their control characters.
#[test]
fn test_non_posix_escapes() {
    // The third rule, `a\b a`, can only match the text "a a" if `\b` acts as a
    // zero-width word boundary between the first `a` and the space.
    let src = r#"%%
\\ 'slash'
\a 'alert'
a\b a 'work_break'
\f 'feed'
\n 'newline'
\r 'return'
\t 'tab'
\v 'vtab'
\q 'normal_char'
"#
    .to_string();

    let mut options = DEFAULT_REGEX_OPTIONS;
    options.posix_escapes = false;

    let lexerdef =
        LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
            .unwrap();

    // Input pieces appear in rule order, so the n-th lexeme must carry token id n.
    let lexemes = lexerdef
        .lexer("\\\x07a a\x0c\n\r\t\x0bq")
        .iter()
        .map(|res| res.unwrap())
        .collect::<Vec<_>>();

    assert_eq!(lexemes.len(), 9);
    for (expected_id, lexeme) in (0..9u8).zip(lexemes.iter()) {
        assert_eq!(lexeme.tok_id(), expected_id);
    }
}

#[test]
fn test_basic_error() {
let src = "
Expand Down
18 changes: 13 additions & 5 deletions lrlex/src/lib/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ lazy_static! {
Regex::new(r"^%[xX][a-zA-Z0-9]*$").unwrap();
// Documented in `Escape sequences` in lexcompatibility.md
static ref RE_LEX_ESC_LITERAL: Regex =
Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[AbBz])").unwrap();
Regex::new(r"^(([xuU][[:xdigit:]])|[[:digit:]]|[afnrtv\\]|[pP]|[dDsSwW]|[Az])").unwrap();
// Vertical line separators.
static ref RE_LINE_SEP: Regex = Regex::new(r"[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]").unwrap();
static ref RE_LEADING_LINE_SEPS: Regex = Regex::new(r"^[\p{Pattern_White_Space}&&[\p{Zl}\p{Zp}\n\r\v]]*").unwrap();
Expand Down Expand Up @@ -490,7 +490,7 @@ where
/// XBD File Format Notation ( '\\', '\a', '\b', '\f' , '\n', '\r', '\t', '\v' ).
///
/// Meaning: The character 'c', unchanged.
fn unescape(re: Cow<str>) -> Cow<str> {
fn unescape<'b>(re: Cow<'b, str>, regex_options: &'_ RegexOptions) -> Cow<'b, str> {
// POSIX lex has two layers of escaping, there are escapes for the regular
// expressions themselves and the escapes which get handled by lex directly.
// We can find what the `regex` crate needs to be escaped with `is_meta_character`.
Expand Down Expand Up @@ -520,7 +520,7 @@ where
if !(regex_syntax::is_meta_character(c2)
|| RE_LEX_ESC_LITERAL.is_match(s))
{
break (Some((i, s, j, c2)));
break Some((i, s, j, c2));
}
}
}
Expand All @@ -539,7 +539,15 @@ where
let mut last_pos = 0;

'outer: while let Some((i, s, j, c)) = cursor {
if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
if c == 'b' {
unescaped.push_str(&re_str[last_pos..i]);
unescaped.push_str(if regex_options.posix_escapes {
"\\x08"
} else {
"\\b"
});
last_pos = j + 1;
} else if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
// For both meta characters and literals we want to push the entire substring
// up to and including the c match back into the string still escaped.
unescaped.push_str(&re_str[last_pos..j + c.len_utf8()]);
Expand Down Expand Up @@ -570,7 +578,7 @@ where
Cow::from(unescaped)
}

Ok((vec![], unescape(Cow::from(re_str))))
Ok((vec![], unescape(Cow::from(re_str), &self.regex_options)))
} else {
match re_str.find('>') {
None => Err(self.mk_error(LexErrorKind::InvalidStartState, off)),
Expand Down