Skip to content

Commit 8255e9e

Browse files
authored
Add lexing of string interpolation (#44)
Adds lexing of string interpolations, both single- and double-quoted. It required some manual parsing, which took a few tries, but it's not too bad and can serve as a proof-of-concept if we need some more manual lexing in the Logos lexer. One limitation is that the string interpolations cannot host the same types of quotes, for example `$"foo("bar")baz"` is not allowed, but in Nushell it is allowed. It might be possible to add it, but would require more hassle, so I left it out for now as it's not a severe limitation. It also adds a LexError for reporting lexing errors. Now it is used only in string interpolation, but it can be expanded in the future.
1 parent bfa8ed6 commit 8255e9e

File tree

7 files changed

+512
-24
lines changed

7 files changed

+512
-24
lines changed

src/compiler.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,18 @@ impl Span {
2525
}
2626
}
2727

28+
#[derive(Debug, PartialEq)]
29+
pub struct Spanned<T> {
30+
pub item: T,
31+
pub span: Span,
32+
}
33+
34+
impl<T> Spanned<T> {
35+
pub fn new(item: T, span: Span) -> Self {
36+
Spanned { item, span }
37+
}
38+
}
39+
2840
#[derive(Clone)]
2941
pub struct Compiler {
3042
// Core information, indexed by NodeId:

src/lexer.rs

Lines changed: 272 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
1-
use crate::compiler::Span;
2-
use logos::Logos;
1+
use crate::compiler::{Span, Spanned};
2+
use logos::{Lexer, Logos};
3+
4+
#[derive(Debug, Default, Copy, Clone, PartialEq)]
5+
pub enum LexError {
6+
#[default]
7+
Generic,
8+
UnmatchedStrInterpLParen,
9+
UnmatchedStrInterpRParen,
10+
}
311

412
/// Average number of bytes per token used for estimating the tokens buffer size.
513
///
@@ -124,42 +132,156 @@ impl Tokens {
124132
}
125133
}
126134

127-
/// Lex the source contents and return allocated Tokens.
128-
///
129-
/// In the case of error, you can look up the last stored token to get a clue what went wrong. The
130-
/// last stored token is always End Of File (EOF), so there will always be at least one token.
131-
pub fn lex(contents: &[u8], span_offset: usize) -> (Tokens, Result<(), ()>) {
132-
// TODO: We might require the contents to always end with a newline, in which case return an error
133-
let mut tokens = Tokens::new(contents);
134-
let lexer = Token::lexer(contents).spanned();
135+
// TODO: Deduplicate code between lex_internal_dq_string_interp() and lex_internal_sq_string_interp()
136+
/// Lex the contents of a double-quoted string interpolation
137+
fn lex_internal_dq_string_interp(
138+
contents: &[u8],
139+
span_offset: usize,
140+
tokens: &mut Tokens,
141+
) -> Result<(), Spanned<LexError>> {
142+
let lexer = DqStrInterpToken::lexer(contents).spanned();
143+
144+
for (res, span) in lexer {
145+
let new_span = Span::new(span.start + span_offset, span.end + span_offset);
146+
match res {
147+
Ok(DqStrInterpToken::Start) => {
148+
tokens.push(Token::DqStringInterpStart, new_span);
149+
}
150+
Ok(DqStrInterpToken::StringChunk) => {
151+
tokens.push(Token::StrInterpChunk, new_span);
152+
}
153+
Ok(DqStrInterpToken::Subexpression) => {
154+
tokens.push(
155+
Token::StrInterpLParen,
156+
Span::new(new_span.start, new_span.start + 1),
157+
);
158+
159+
lex_internal(
160+
&contents[span.start + 1..span.end - 1],
161+
span_offset + span.start + 1,
162+
tokens,
163+
)?;
164+
165+
tokens.push(
166+
Token::StrInterpRParen,
167+
Span::new(new_span.end - 1, new_span.end),
168+
);
169+
}
170+
Ok(DqStrInterpToken::End) => {
171+
tokens.push(Token::StrInterpEnd, new_span);
172+
return Ok(());
173+
}
174+
Err(e) => {
175+
return Err(Spanned::new(e, new_span));
176+
}
177+
}
178+
}
179+
180+
Ok(())
181+
}
182+
183+
// TODO: Deduplicate code between lex_internal_dq_string_interp() and lex_internal_sq_string_interp()
184+
/// Lex the contents of a single-quoted string interpolation
185+
fn lex_internal_sq_string_interp(
186+
contents: &[u8],
187+
span_offset: usize,
188+
tokens: &mut Tokens,
189+
) -> Result<(), Spanned<LexError>> {
190+
let lexer = SqStrInterpToken::lexer(contents).spanned();
135191

136192
for (res, span) in lexer {
193+
let new_span = Span::new(span.start + span_offset, span.end + span_offset);
137194
match res {
138-
Ok(token) => tokens.push(
139-
token,
140-
Span::new(span.start + span_offset, span.end + span_offset),
141-
),
142-
Err(_) => {
195+
Ok(SqStrInterpToken::Start) => {
196+
tokens.push(Token::SqStringInterpStart, new_span);
197+
}
198+
Ok(SqStrInterpToken::StringChunk) => {
199+
tokens.push(Token::StrInterpChunk, new_span);
200+
}
201+
Ok(SqStrInterpToken::Subexpression) => {
143202
tokens.push(
144-
Token::Eof,
145-
Span::new(span.end + span_offset, span.end + span_offset),
203+
Token::StrInterpLParen,
204+
Span::new(new_span.start, new_span.start + 1),
146205
);
147-
return (tokens, Err(()));
206+
207+
lex_internal(
208+
&contents[span.start + 1..span.end - 1],
209+
span_offset + span.start + 1,
210+
tokens,
211+
)?;
212+
213+
tokens.push(
214+
Token::StrInterpRParen,
215+
Span::new(new_span.end - 1, new_span.end),
216+
);
217+
}
218+
Ok(SqStrInterpToken::End) => {
219+
tokens.push(Token::StrInterpEnd, new_span);
220+
return Ok(());
221+
}
222+
Err(e) => {
223+
return Err(Spanned::new(e, new_span));
224+
}
225+
}
226+
}
227+
228+
Ok(())
229+
}
230+
231+
fn lex_internal(
232+
contents: &[u8],
233+
span_offset: usize,
234+
tokens: &mut Tokens,
235+
) -> Result<(), Spanned<LexError>> {
236+
let lexer = Token::lexer(contents).spanned();
237+
238+
for (res, span) in lexer {
239+
let new_span = Span::new(span.start + span_offset, span.end + span_offset);
240+
match res {
241+
Ok(Token::DqStrInterp) => lex_internal_dq_string_interp(
242+
&contents[span.start..span.end],
243+
span_offset + span.start,
244+
tokens,
245+
)?,
246+
Ok(Token::SqStrInterp) => lex_internal_sq_string_interp(
247+
&contents[span.start..span.end],
248+
span_offset + span.start,
249+
tokens,
250+
)?,
251+
Ok(token) => tokens.push(token, new_span),
252+
Err(e) => {
253+
return Err(Spanned::new(e, new_span));
148254
}
149255
}
150256
}
151257

258+
Ok(())
259+
}
260+
261+
/// Lex the source contents and return allocated Tokens.
262+
///
263+
/// In the case of error, you can look up the last stored token to get a clue what went wrong. The
264+
/// last stored token is always End Of File (EOF), so there will always be at least one token.
265+
pub fn lex(contents: &[u8], span_offset: usize) -> (Tokens, Result<(), Spanned<LexError>>) {
266+
// TODO: We might require the contents to always end with a newline, in which case return an error
267+
let mut tokens = Tokens::new(contents);
268+
let res = lex_internal(contents, span_offset, &mut tokens);
269+
152270
tokens.push(
153271
Token::Eof,
154272
Span::new(contents.len() + span_offset, contents.len() + span_offset),
155273
);
156274

275+
if let Err(e) = res {
276+
return (tokens, Err(e));
277+
}
278+
157279
(tokens, Ok(()))
158280
}
159281

160282
#[derive(Logos, Debug, Clone, Copy, PartialEq)]
161283
#[logos(skip r"[ \t]+")]
162-
#[logos(source = [u8])]
284+
#[logos(source = [u8], error = LexError)]
163285
pub enum Token {
164286
#[regex("(0[xob])?[0-9][0-9_]*", priority = 10)]
165287
Int,
@@ -286,17 +408,111 @@ pub enum Token {
286408
ErrGreaterThanPipe,
287409
#[token("o+e>|")]
288410
OutErrGreaterThanPipe,
289-
/// End of file, doesn't match any syntax, but source code always end with it
411+
/// Double quoted string interpolation $"..."
412+
///
413+
/// The token is passed to a separate lexer and is not actually present in the result.
414+
/// Unescaped double quotes are not permitted, for example, $"foo("bar")" is not allowed.
415+
#[regex(r#"\$"([^"]|\\")*""#)]
416+
DqStrInterp,
417+
/// Single-quoted string interpolation $'...'
418+
///
419+
/// The token is passed to a separate lexer and is not actually present in the result.
420+
#[regex(r#"\$'[^']*'"#)]
421+
SqStrInterp,
422+
/// Start of double-quoted string interpolation $" (returned from separate lexing)
423+
DqStringInterpStart,
424+
/// Start of single-quoted string interpolation $' (returned from separate lexing)
425+
SqStringInterpStart,
426+
/// Non-interpolated string chunk within any string interpolation (returned from separate lexing)
427+
///
428+
/// For example, "foo" within $"foo(1)"
429+
StrInterpChunk,
430+
/// Left parenthesis inside any string interpolation (returned from separate lexing)
431+
StrInterpLParen,
432+
/// Right parenthesis inside any string interpolation (returned from separate lexing)
433+
StrInterpRParen,
434+
/// End of any string interpolation (returned from separate lexing)
435+
StrInterpEnd,
436+
/// End of file, doesn't match any syntax, but lexed tokens always end with it
290437
Eof,
291438
}
292439

440+
fn match_subexpression<'a, T: Logos<'a>>(
441+
remainder: &[u8],
442+
lexer: &mut Lexer<'a, T>,
443+
) -> Result<(), LexError> {
444+
let mut depth = 1;
445+
let mut pos = 0;
446+
447+
while pos < remainder.len() {
448+
match remainder[pos] {
449+
b'(' => depth += 1,
450+
b')' => depth -= 1,
451+
_ => (),
452+
}
453+
454+
if depth == 0 {
455+
break;
456+
}
457+
458+
if depth < 0 {
459+
// unmatched )
460+
return Err(LexError::UnmatchedStrInterpRParen);
461+
}
462+
463+
pos += 1;
464+
}
465+
466+
if depth > 0 {
467+
// unmatched (
468+
return Err(LexError::UnmatchedStrInterpLParen);
469+
}
470+
471+
lexer.bump(pos + 1);
472+
Ok(())
473+
}
474+
475+
/// Tokens representing double-quoted string interpolation
476+
#[derive(Logos, Debug, Clone, Copy, PartialEq)]
477+
#[logos(source = [u8], error = LexError)]
478+
enum DqStrInterpToken {
479+
#[token(r#"$""#)]
480+
Start,
481+
#[regex(r#"([^"\\\(]|\\["\\bnfrt\(])+"#)]
482+
StringChunk,
483+
#[token("(", |lex| match_subexpression(lex.remainder(), lex))]
484+
Subexpression,
485+
#[token(r#"""#)]
486+
End,
487+
}
488+
489+
/// Tokens representing single-quoted string interpolation
490+
#[derive(Logos, Debug, Clone, Copy, PartialEq)]
491+
#[logos(source = [u8], error=LexError)]
492+
enum SqStrInterpToken {
493+
#[token(r#"$'"#)]
494+
Start,
495+
#[regex(r#"[^'\(]+"#)]
496+
StringChunk,
497+
#[token("(", |lex| match_subexpression(lex.remainder(), lex))]
498+
Subexpression,
499+
#[token(r#"'"#)]
500+
End,
501+
}
502+
293503
#[cfg(test)]
294504
mod test {
295505
/// Lexer tests useful for smaller sources, errors and corner cases
296-
use crate::compiler::Span;
506+
use crate::compiler::{Span, Spanned};
297507
use crate::lexer::{lex, Token};
298508

299-
fn test_lex(src: &[u8], expected_tokens: &[(Token, Span)], expected_result: Result<(), ()>) {
509+
use super::LexError;
510+
511+
fn test_lex(
512+
src: &[u8],
513+
expected_tokens: &[(Token, Span)],
514+
expected_result: Result<(), Spanned<LexError>>,
515+
) {
300516
let (mut actual_tokens, actual_result) = lex(src, 0);
301517

302518
assert_eq!(expected_result, actual_result, "Lexing result mismatch");
@@ -320,6 +536,39 @@ mod test {
320536
#[test]
321537
fn lex_unmatched_string() {
322538
// TODO: Make unmatched delimiters nicer
323-
test_lex(b"'unmatched string", &[(Token::Eof, span(17, 17))], Err(()));
539+
test_lex(
540+
b"'unmatched string",
541+
&[(Token::Eof, span(17, 17))],
542+
Err(Spanned::new(LexError::Generic, Span::new(0, 17))),
543+
);
544+
}
545+
546+
#[test]
547+
fn lex_string_interp_errors() {
548+
test_lex(
549+
br#"$"foo("baz")bar""#,
550+
&[
551+
(Token::DqStringInterpStart, span(0, 2)),
552+
(Token::StrInterpChunk, span(2, 5)),
553+
(Token::Eof, span(16, 16)),
554+
],
555+
Err(Spanned::new(
556+
LexError::UnmatchedStrInterpLParen,
557+
Span::new(5, 6),
558+
)),
559+
);
560+
561+
test_lex(
562+
br#"$'foo('baz')bar'"#,
563+
&[
564+
(Token::SqStringInterpStart, span(0, 2)),
565+
(Token::StrInterpChunk, span(2, 5)),
566+
(Token::Eof, span(16, 16)),
567+
],
568+
Err(Spanned::new(
569+
LexError::UnmatchedStrInterpLParen,
570+
Span::new(5, 6),
571+
)),
572+
);
324573
}
325574
}

src/main.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,13 @@ fn main() {
3434
let (tokens, err) = lex(&contents, span_offset);
3535
if let Err(e) = err {
3636
tokens.print(&compiler.source);
37-
eprintln!("Lexing error. Error: {:?}", e);
37+
eprintln!(
38+
"Lexing error. Error: {:?}, '{}'",
39+
e,
40+
String::from_utf8_lossy(
41+
compiler.get_span_contents_manual(e.span.start, e.span.end)
42+
)
43+
);
3844
exit(1);
3945
}
4046

0 commit comments

Comments
 (0)