Skip to content

Commit c3b6c27

Browse files
committed
Add posix_escapes to RegexOptions
This makes the POSIX-lex-compatible '\b'-escape change backwards-compatible, by putting it behind a defaulted-to-off flag.
1 parent 15b2bd4 commit c3b6c27

File tree

3 files changed

+55
-5
lines changed

3 files changed

+55
-5
lines changed

lrlex/src/lib/ctbuilder.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
443443
dot_matches_new_line: {dot_matches_new_line:?},
444444
multi_line: {multi_line:?},
445445
octal: {octal:?},
446+
posix_escapes: {posix_escapes:?},
446447
case_insensitive: {case_insensitive:?},
447448
unicode: {unicode:?},
448449
swap_greed: {swap_greed:?},
@@ -454,6 +455,7 @@ pub fn lexerdef() -> {lexerdef_type} {{
454455
dot_matches_new_line = self.regex_options.dot_matches_new_line,
455456
multi_line = self.regex_options.multi_line,
456457
octal = self.regex_options.octal,
458+
posix_escapes = self.regex_options.posix_escapes,
457459
case_insensitive = self.regex_options.case_insensitive,
458460
unicode = self.regex_options.unicode,
459461
swap_greed = self.regex_options.swap_greed,
@@ -668,6 +670,13 @@ pub fn lexerdef() -> {lexerdef_type} {{
668670
self
669671
}
670672

673+
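/// Sets whether the lex escape `\b` is rewritten to the backspace character (`\x08`, POSIX lex behaviour) instead of being passed to the `regex` crate unchanged as `\b`.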
674+
/// The default value is `false`.
675+
pub fn posix_escapes(mut self, flag: bool) -> Self {
676+
self.regex_options.posix_escapes = flag;
677+
self
678+
}
679+
671680
/// Sets the `regex::RegexBuilder` option of the same name.
672681
/// The default value is `true`.
673682
pub fn octal(mut self, flag: bool) -> Self {

lrlex/src/lib/lexer.rs

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ pub struct RegexOptions {
2525
pub dot_matches_new_line: bool,
2626
pub multi_line: bool,
2727
pub octal: bool,
28+
pub posix_escapes: bool,
2829
pub case_insensitive: Option<bool>,
2930
pub swap_greed: Option<bool>,
3031
pub ignore_whitespace: Option<bool>,
@@ -38,6 +39,7 @@ pub const DEFAULT_REGEX_OPTIONS: RegexOptions = RegexOptions {
3839
dot_matches_new_line: true,
3940
multi_line: true,
4041
octal: true,
42+
posix_escapes: false,
4143
case_insensitive: None,
4244
ignore_whitespace: None,
4345
swap_greed: None,
@@ -650,7 +652,7 @@ mod test {
650652
}
651653

652654
#[test]
653-
fn test_escapes() {
655+
fn test_posix_escapes() {
654656
let src = r#"%%
655657
\\ 'slash'
656658
\a 'alert'
@@ -663,7 +665,11 @@ mod test {
663665
\q 'normal_char'
664666
"#
665667
.to_string();
666-
let lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::from_str(&src).unwrap();
668+
let mut options = DEFAULT_REGEX_OPTIONS;
669+
options.posix_escapes = true;
670+
let lexerdef =
671+
LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
672+
.unwrap();
667673
let lexemes = lexerdef
668674
.lexer("\\\x07\x08\x0c\n\r\t\x0bq")
669675
.iter()
@@ -676,6 +682,37 @@ mod test {
676682
}
677683
}
678684

685+
#[test]
686+
fn test_non_posix_escapes() {
687+
let src = r#"%%
688+
\\ 'slash'
689+
\a 'alert'
690+
a\b a 'work_break'
691+
\f 'feed'
692+
\n 'newline'
693+
\r 'return'
694+
\t 'tab'
695+
\v 'vtab'
696+
\q 'normal_char'
697+
"#
698+
.to_string();
699+
let mut options = DEFAULT_REGEX_OPTIONS;
700+
options.posix_escapes = false;
701+
let lexerdef =
702+
LRNonStreamingLexerDef::<DefaultLexerTypes<u8>>::new_with_options(&src, options)
703+
.unwrap();
704+
let lexemes = lexerdef
705+
.lexer("\\\x07a a\x0c\n\r\t\x0bq")
706+
.iter()
707+
.map(|x| x.unwrap())
708+
.collect::<Vec<_>>();
709+
assert_eq!(lexemes.len(), 9);
710+
for i in 0..9u8 {
711+
let lexeme = lexemes[i as usize];
712+
assert_eq!(lexeme.tok_id(), i);
713+
}
714+
}
715+
679716
#[test]
680717
fn test_basic_error() {
681718
let src = "

lrlex/src/lib/parser.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,7 @@ where
490490
/// XBD File Format Notation ( '\\', '\a', '\b', '\f' , '\n', '\r', '\t', '\v' ).
491491
///
492492
/// Meaning: The character 'c', unchanged.
493-
fn unescape(re: Cow<str>) -> Cow<str> {
493+
fn unescape<'a, 'b>(re: Cow<'b, str>, regex_options: &'a RegexOptions) -> Cow<'b, str> {
494494
// POSIX lex has two layers of escaping, there are escapes for the regular
495495
// expressions themselves and the escapes which get handled by lex directly.
496496
// We can find what the `regex` crate needs to be escaped with `is_meta_character`.
@@ -541,7 +541,11 @@ where
541541
'outer: while let Some((i, s, j, c)) = cursor {
542542
if c == 'b' {
543543
unescaped.push_str(&re_str[last_pos..i]);
544-
unescaped.push_str("\\x08");
544+
unescaped.push_str(if regex_options.posix_escapes {
545+
"\\x08"
546+
} else {
547+
"\\b"
548+
});
545549
last_pos = j + 1;
546550
} else if regex_syntax::is_meta_character(c) || RE_LEX_ESC_LITERAL.is_match(s) {
547551
// For both meta characters and literals we want to push the entire substring
@@ -574,7 +578,7 @@ where
574578
Cow::from(unescaped)
575579
}
576580

577-
Ok((vec![], unescape(Cow::from(re_str))))
581+
Ok((vec![], unescape(Cow::from(re_str), &self.regex_options)))
578582
} else {
579583
match re_str.find('>') {
580584
None => Err(self.mk_error(LexErrorKind::InvalidStartState, off)),

0 commit comments

Comments
 (0)