From 1618d0c2c2c62dcdd64983c47f0776f4134a825d Mon Sep 17 00:00:00 2001 From: WindSoilder Date: Tue, 21 Jan 2025 20:25:35 +0800 Subject: [PATCH] support lexing raw string --- src/lexer.rs | 29 +++++++++++++++++++ ..._nu_parser__test__lexer@raw_string.nu.snap | 14 +++++++++ tests/lex/raw_string.nu | 9 ++++++ 3 files changed, 52 insertions(+) create mode 100644 src/snapshots/new_nu_parser__test__lexer@raw_string.nu.snap create mode 100644 tests/lex/raw_string.nu diff --git a/src/lexer.rs b/src/lexer.rs index 3a60a20..b626d2a 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -7,6 +7,7 @@ pub enum LexError { Generic, UnmatchedStrInterpLParen, UnmatchedStrInterpRParen, + UnmatchedRawStringRSharp, } /// Average number of bytes per token used for estimating the tokens buffer size. @@ -279,6 +280,32 @@ pub fn lex(contents: &[u8], span_offset: usize) -> (Tokens, Result<(), Spanned) -> Result<(), LexError> { + let prefix = lexer.slice(); + let prefix_sharp_length = prefix[1..prefix.len() - 1].len(); // without first `r` and last `'` + let mut pos = 0; + + while pos < remainder.len() { + if remainder[pos] == b'\'' { + // might be ending of raw string like '##, move forward and check. + pos += 1; + let mut postfix_sharp_length = 0; + while pos < remainder.len() && remainder[pos] == b'#' { + pos += 1; + postfix_sharp_length += 1; + if postfix_sharp_length == prefix_sharp_length { + // found a matched raw string. + lexer.bump(pos); + return Ok(()); + } + } + } else { + pos += 1; + } + } + Err(LexError::UnmatchedRawStringRSharp) +} + #[derive(Logos, Debug, Clone, Copy, PartialEq)] #[logos(skip r"[ \t]+")] #[logos(source = [u8], error = LexError)] @@ -295,6 +322,8 @@ pub enum Token { SingleQuotedString, #[regex(r#"`[^`]*`"#)] BacktickBareword, + #[regex("r#+'", |lex| match_rawstring(lex.remainder(), lex))] + RawString, // #[regex(r#"[ \t]+"#)] // HorizontalWhitespace, #[regex(r#"[0-9]{4}-[0-9]{2}-[0-9]{2}(T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?)?(Z|[\+-][0-9]{2}:[0-9]{2})?"#)] diff --git a/src/snapshots/new_nu_parser__test__lexer@raw_string.nu.snap b/src/snapshots/new_nu_parser__test__lexer@raw_string.nu.snap new file mode 100644 index 0000000..61282f1 --- /dev/null +++ b/src/snapshots/new_nu_parser__test__lexer@raw_string.nu.snap @@ -0,0 +1,14 @@ +--- +source: src/test.rs +expression: evaluate_lexer(path) +input_file: tests/lex/raw_string.nu +snapshot_kind: text +--- +==== TOKENS ==== +Token3 0: RawString span: 0 .. 9 'r#'aabb'#' +Token3 1: Newline span: 9 .. 10 '\n' +Token3 2: RawString span: 10 .. 25 'r##'aa\n'#\nbb'##' +Token3 3: Newline span: 25 .. 26 '\n' +Token3 4: RawString span: 26 .. 58 'r####'aa\nbb\ncc'##dd\n###\nddd'####' +Token3 5: Newline span: 58 .. 59 '\n' +Token3 6: Eof span: 59 .. 59 '' diff --git a/tests/lex/raw_string.nu b/tests/lex/raw_string.nu new file mode 100644 index 0000000..60ee517 --- /dev/null +++ b/tests/lex/raw_string.nu @@ -0,0 +1,9 @@ +r#'aabb'# +r##'aa +'# +bb'## +r####'aa +bb +cc'##dd +### +ddd'####