Add lexing of rawstring (#48)

WindSoilder · web-flow · commit fca3632a5304 · 2025-01-27T20:04:41.000+02:00
As the title says, this PR makes nushell support lexing of raw strings,
which have the syntax:
- `r#'<content>'#`
- `r##'<content>'##`
- `r###'<content>'###`
- etc...

To implement this, the lexer just needs to scan forward to find the closing
marker of the raw string.
diff --git a/src/lexer.rs b/src/lexer.rs
@@ -7,6 +7,7 @@ pub enum LexError {
     Generic,
     UnmatchedStrInterpLParen,
     UnmatchedStrInterpRParen,
+    UnmatchedRawStringRSharp,
 }
 
 /// Average number of bytes per token used for estimating the tokens buffer size.
@@ -279,6 +280,32 @@ pub fn lex(contents: &[u8], span_offset: usize) -> (Tokens, Result<(), Spanned<L
     (tokens, Ok(()))
 }
 
+/// Logos callback for the `r#+'` prefix: looks in `remainder` for `'` followed by as many `#`
+/// as the prefix has, bumps `lexer` past that terminator, or fails if there is none.
+fn match_rawstring(remainder: &[u8], lexer: &mut Lexer<Token>) -> Result<(), LexError> {
+    let wanted_sharps = lexer.slice().len() - 2; // prefix minus leading `r` and trailing `'`
+    let mut cursor = 0;
+
+    while cursor < remainder.len() {
+        let at_quote = remainder[cursor] == b'\'';
+        cursor += 1;
+        if !at_quote {
+            continue;
+        }
+        // Candidate terminator such as `'##`: count the `#` run right after the quote.
+        let mut seen_sharps = 0;
+        while cursor < remainder.len() && remainder[cursor] == b'#' {
+            cursor += 1;
+            seen_sharps += 1;
+            if seen_sharps == wanted_sharps {
+                lexer.bump(cursor); // consume the body and the closing marker
+                return Ok(());
+            }
+        }
+    }
+    Err(LexError::UnmatchedRawStringRSharp)
+}
+
 #[derive(Logos, Debug, Clone, Copy, PartialEq)]
 #[logos(skip r"[ \t]+")]
 #[logos(source = [u8], error = LexError)]
@@ -295,6 +322,8 @@ pub enum Token {
     SingleQuotedString,
     #[regex(r#"`[^`]*`"#)]
     BacktickBareword,
+    #[regex("r#+'", |lex| match_rawstring(lex.remainder(), lex))]
+    RawString,
     // #[regex(r#"[ \t]+"#)]
     // HorizontalWhitespace,
     #[regex(r#"[0-9]{4}-[0-9]{2}-[0-9]{2}(T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?)?(Z|[\+-][0-9]{2}:[0-9]{2})?"#)]
diff --git a/src/snapshots/new_nu_parser__test__lexer@raw_string.nu.snap b/src/snapshots/new_nu_parser__test__lexer@raw_string.nu.snap
@@ -0,0 +1,14 @@
+---
+source: src/test.rs
+expression: evaluate_lexer(path)
+input_file: tests/lex/raw_string.nu
+snapshot_kind: text
+---
+==== TOKENS ====
+Token3    0: RawString                 span:    0 ..    9 'r#'aabb'#'
+Token3    1: Newline                   span:    9 ..   10 '\n'
+Token3    2: RawString                 span:   10 ..   25 'r##'aa\n'#\nbb'##'
+Token3    3: Newline                   span:   25 ..   26 '\n'
+Token3    4: RawString                 span:   26 ..   58 'r####'aa\nbb\ncc'##dd\n###\nddd'####'
+Token3    5: Newline                   span:   58 ..   59 '\n'
+Token3    6: Eof                       span:   59 ..   59 ''
diff --git a/tests/lex/raw_string.nu b/tests/lex/raw_string.nu
@@ -0,0 +1,9 @@
+r#'aabb'#
+r##'aa
+'#
+bb'##
+r####'aa
+bb
+cc'##dd
+###
+ddd'####