1- use crate :: compiler:: Span ;
2- use logos:: Logos ;
1+ use crate :: compiler:: { Span , Spanned } ;
2+ use logos:: { Lexer , Logos } ;
3+
/// Errors that can occur while lexing source contents.
///
/// `Eq` is derived alongside `PartialEq` since all variants are unit variants
/// (clippy: `derive_partial_eq_without_eq`).
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub enum LexError {
    /// Catch-all lexing failure (e.g., an unrecognized or unterminated token).
    #[default]
    Generic,
    /// A '(' inside a string interpolation was never closed.
    UnmatchedStrInterpLParen,
    /// A ')' inside a string interpolation had no matching '('.
    UnmatchedStrInterpRParen,
}
311
412/// Average number of bytes per token used for estimating the tokens buffer size.
513///
@@ -124,42 +132,156 @@ impl Tokens {
124132 }
125133}
126134
127- /// Lex the source contents and return allocated Tokens.
128- ///
129- /// In the case of error, you can look up the last stored token to get a clue what went wrong. The
130- /// last stored token is always End Of File (EOF), so there will always be at least one token.
131- pub fn lex ( contents : & [ u8 ] , span_offset : usize ) -> ( Tokens , Result < ( ) , ( ) > ) {
132- // TODO: We might require the contents to always end with a newline, in which case return an error
133- let mut tokens = Tokens :: new ( contents) ;
134- let lexer = Token :: lexer ( contents) . spanned ( ) ;
135+ // TODO: Deduplicate code between lex_internal_dq_string_interp() and lex_internal_sq_string_interp()
136+ /// Lex the contents of a double-quoted string interpolation
137+ fn lex_internal_dq_string_interp (
138+ contents : & [ u8 ] ,
139+ span_offset : usize ,
140+ tokens : & mut Tokens ,
141+ ) -> Result < ( ) , Spanned < LexError > > {
142+ let lexer = DqStrInterpToken :: lexer ( contents) . spanned ( ) ;
143+
144+ for ( res, span) in lexer {
145+ let new_span = Span :: new ( span. start + span_offset, span. end + span_offset) ;
146+ match res {
147+ Ok ( DqStrInterpToken :: Start ) => {
148+ tokens. push ( Token :: DqStringInterpStart , new_span) ;
149+ }
150+ Ok ( DqStrInterpToken :: StringChunk ) => {
151+ tokens. push ( Token :: StrInterpChunk , new_span) ;
152+ }
153+ Ok ( DqStrInterpToken :: Subexpression ) => {
154+ tokens. push (
155+ Token :: StrInterpLParen ,
156+ Span :: new ( new_span. start , new_span. start + 1 ) ,
157+ ) ;
158+
159+ lex_internal (
160+ & contents[ span. start + 1 ..span. end - 1 ] ,
161+ span_offset + span. start + 1 ,
162+ tokens,
163+ ) ?;
164+
165+ tokens. push (
166+ Token :: StrInterpRParen ,
167+ Span :: new ( new_span. end - 1 , new_span. end ) ,
168+ ) ;
169+ }
170+ Ok ( DqStrInterpToken :: End ) => {
171+ tokens. push ( Token :: StrInterpEnd , new_span) ;
172+ return Ok ( ( ) ) ;
173+ }
174+ Err ( e) => {
175+ return Err ( Spanned :: new ( e, new_span) ) ;
176+ }
177+ }
178+ }
179+
180+ Ok ( ( ) )
181+ }
182+
183+ // TODO: Deduplicate code between lex_internal_dq_string_interp() and lex_internal_sq_string_interp()
184+ /// Lex the contents of a single-quoted string interpolation
185+ fn lex_internal_sq_string_interp (
186+ contents : & [ u8 ] ,
187+ span_offset : usize ,
188+ tokens : & mut Tokens ,
189+ ) -> Result < ( ) , Spanned < LexError > > {
190+ let lexer = SqStrInterpToken :: lexer ( contents) . spanned ( ) ;
135191
136192 for ( res, span) in lexer {
193+ let new_span = Span :: new ( span. start + span_offset, span. end + span_offset) ;
137194 match res {
138- Ok ( token) => tokens. push (
139- token,
140- Span :: new ( span. start + span_offset, span. end + span_offset) ,
141- ) ,
142- Err ( _) => {
195+ Ok ( SqStrInterpToken :: Start ) => {
196+ tokens. push ( Token :: SqStringInterpStart , new_span) ;
197+ }
198+ Ok ( SqStrInterpToken :: StringChunk ) => {
199+ tokens. push ( Token :: StrInterpChunk , new_span) ;
200+ }
201+ Ok ( SqStrInterpToken :: Subexpression ) => {
143202 tokens. push (
144- Token :: Eof ,
145- Span :: new ( span . end + span_offset , span . end + span_offset ) ,
203+ Token :: StrInterpLParen ,
204+ Span :: new ( new_span . start , new_span . start + 1 ) ,
146205 ) ;
147- return ( tokens, Err ( ( ) ) ) ;
206+
207+ lex_internal (
208+ & contents[ span. start + 1 ..span. end - 1 ] ,
209+ span_offset + span. start + 1 ,
210+ tokens,
211+ ) ?;
212+
213+ tokens. push (
214+ Token :: StrInterpRParen ,
215+ Span :: new ( new_span. end - 1 , new_span. end ) ,
216+ ) ;
217+ }
218+ Ok ( SqStrInterpToken :: End ) => {
219+ tokens. push ( Token :: StrInterpEnd , new_span) ;
220+ return Ok ( ( ) ) ;
221+ }
222+ Err ( e) => {
223+ return Err ( Spanned :: new ( e, new_span) ) ;
224+ }
225+ }
226+ }
227+
228+ Ok ( ( ) )
229+ }
230+
231+ fn lex_internal (
232+ contents : & [ u8 ] ,
233+ span_offset : usize ,
234+ tokens : & mut Tokens ,
235+ ) -> Result < ( ) , Spanned < LexError > > {
236+ let lexer = Token :: lexer ( contents) . spanned ( ) ;
237+
238+ for ( res, span) in lexer {
239+ let new_span = Span :: new ( span. start + span_offset, span. end + span_offset) ;
240+ match res {
241+ Ok ( Token :: DqStrInterp ) => lex_internal_dq_string_interp (
242+ & contents[ span. start ..span. end ] ,
243+ span_offset + span. start ,
244+ tokens,
245+ ) ?,
246+ Ok ( Token :: SqStrInterp ) => lex_internal_sq_string_interp (
247+ & contents[ span. start ..span. end ] ,
248+ span_offset + span. start ,
249+ tokens,
250+ ) ?,
251+ Ok ( token) => tokens. push ( token, new_span) ,
252+ Err ( e) => {
253+ return Err ( Spanned :: new ( e, new_span) ) ;
148254 }
149255 }
150256 }
151257
258+ Ok ( ( ) )
259+ }
260+
261+ /// Lex the source contents and return allocated Tokens.
262+ ///
263+ /// In the case of error, you can look up the last stored token to get a clue what went wrong. The
264+ /// last stored token is always End Of File (EOF), so there will always be at least one token.
265+ pub fn lex ( contents : & [ u8 ] , span_offset : usize ) -> ( Tokens , Result < ( ) , Spanned < LexError > > ) {
266+ // TODO: We might require the contents to always end with a newline, in which case return an error
267+ let mut tokens = Tokens :: new ( contents) ;
268+ let res = lex_internal ( contents, span_offset, & mut tokens) ;
269+
152270 tokens. push (
153271 Token :: Eof ,
154272 Span :: new ( contents. len ( ) + span_offset, contents. len ( ) + span_offset) ,
155273 ) ;
156274
275+ if let Err ( e) = res {
276+ return ( tokens, Err ( e) ) ;
277+ }
278+
157279 ( tokens, Ok ( ( ) ) )
158280}
159281
160282#[ derive( Logos , Debug , Clone , Copy , PartialEq ) ]
161283#[ logos( skip r"[ \t]+" ) ]
162- #[ logos( source = [ u8 ] ) ]
284+ #[ logos( source = [ u8 ] , error = LexError ) ]
163285pub enum Token {
164286 #[ regex( "(0[xob])?[0-9][0-9_]*" , priority = 10 ) ]
165287 Int ,
@@ -286,17 +408,111 @@ pub enum Token {
286408 ErrGreaterThanPipe ,
287409 #[ token( "o+e>|" ) ]
288410 OutErrGreaterThanPipe ,
289- /// End of file, doesn't match any syntax, but source code always end with it
411+ /// Double quoted string interpolation $"..."
412+ ///
413+ /// The token is passed to a separate lexer and is not actually present in the result.
414+ /// Unescaped double quotes are not permitted, for example, $"foo("bar")" is not allowed.
415+ #[ regex( r#"\$"([^"]|\\")*""# ) ]
416+ DqStrInterp ,
417+ /// Single-quoted string interpolation $'...'
418+ ///
419+ /// The token is passed to a separate lexer and is not actually present in the result.
420+ #[ regex( r#"\$'[^']*'"# ) ]
421+ SqStrInterp ,
422+ /// Start of double-quoted string interpoloation $" (returned from separate lexing)
423+ DqStringInterpStart ,
424+ /// Start of single-quoted string interpoloation $' (returned from separate lexing)
425+ SqStringInterpStart ,
426+ /// Non-interpolated string chunk within any string interpolation (returned from separate lexing)
427+ ///
428+ /// For example, "foo" within $"foo(1)"
429+ StrInterpChunk ,
430+ /// Left parenthesis inside any string interpolation (returned from separate lexing)
431+ StrInterpLParen ,
432+ /// Right parenthesis inside any string interpolation (returned from separate lexing)
433+ StrInterpRParen ,
434+ /// End of any string interpolation (returned from separate lexing)
435+ StrInterpEnd ,
436+ /// End of file, doesn't match any syntax, but lexed tokens always end with it
290437 Eof ,
291438}
292439
440+ fn match_subexpression < ' a , T : Logos < ' a > > (
441+ remainder : & [ u8 ] ,
442+ lexer : & mut Lexer < ' a , T > ,
443+ ) -> Result < ( ) , LexError > {
444+ let mut depth = 1 ;
445+ let mut pos = 0 ;
446+
447+ while pos < remainder. len ( ) {
448+ match remainder[ pos] {
449+ b'(' => depth += 1 ,
450+ b')' => depth -= 1 ,
451+ _ => ( ) ,
452+ }
453+
454+ if depth == 0 {
455+ break ;
456+ }
457+
458+ if depth < 0 {
459+ // unmatched )
460+ return Err ( LexError :: UnmatchedStrInterpRParen ) ;
461+ }
462+
463+ pos += 1 ;
464+ }
465+
466+ if depth > 0 {
467+ // unmatched (
468+ return Err ( LexError :: UnmatchedStrInterpLParen ) ;
469+ }
470+
471+ lexer. bump ( pos + 1 ) ;
472+ Ok ( ( ) )
473+ }
474+
475+ /// Tokens representing double-quoted string interpolation
476+ #[ derive( Logos , Debug , Clone , Copy , PartialEq ) ]
477+ #[ logos( source = [ u8 ] , error = LexError ) ]
478+ enum DqStrInterpToken {
479+ #[ token( r#"$""# ) ]
480+ Start ,
481+ #[ regex( r#"([^"\\\(]|\\["\\bnfrt\(])+"# ) ]
482+ StringChunk ,
483+ #[ token( "(" , |lex| match_subexpression( lex. remainder( ) , lex) ) ]
484+ Subexpression ,
485+ #[ token( r#"""# ) ]
486+ End ,
487+ }
488+
489+ /// Tokens representing single-quoted string interpolation
490+ #[ derive( Logos , Debug , Clone , Copy , PartialEq ) ]
491+ #[ logos( source = [ u8 ] , error=LexError ) ]
492+ enum SqStrInterpToken {
493+ #[ token( r#"$'"# ) ]
494+ Start ,
495+ #[ regex( r#"[^'\(]+"# ) ]
496+ StringChunk ,
497+ #[ token( "(" , |lex| match_subexpression( lex. remainder( ) , lex) ) ]
498+ Subexpression ,
499+ #[ token( r#"'"# ) ]
500+ End ,
501+ }
502+
293503#[ cfg( test) ]
294504mod test {
295505 /// Lexer tests useful for smaller sources, errors and corner cases
296- use crate :: compiler:: Span ;
506+ use crate :: compiler:: { Span , Spanned } ;
297507 use crate :: lexer:: { lex, Token } ;
298508
299- fn test_lex ( src : & [ u8 ] , expected_tokens : & [ ( Token , Span ) ] , expected_result : Result < ( ) , ( ) > ) {
509+ use super :: LexError ;
510+
511+ fn test_lex (
512+ src : & [ u8 ] ,
513+ expected_tokens : & [ ( Token , Span ) ] ,
514+ expected_result : Result < ( ) , Spanned < LexError > > ,
515+ ) {
300516 let ( mut actual_tokens, actual_result) = lex ( src, 0 ) ;
301517
302518 assert_eq ! ( expected_result, actual_result, "Lexing result mismatch" ) ;
@@ -320,6 +536,39 @@ mod test {
320536 #[ test]
321537 fn lex_unmatched_string ( ) {
322538 // TODO: Make unmatched delimiters nicer
323- test_lex ( b"'unmatched string" , & [ ( Token :: Eof , span ( 17 , 17 ) ) ] , Err ( ( ) ) ) ;
539+ test_lex (
540+ b"'unmatched string" ,
541+ & [ ( Token :: Eof , span ( 17 , 17 ) ) ] ,
542+ Err ( Spanned :: new ( LexError :: Generic , Span :: new ( 0 , 17 ) ) ) ,
543+ ) ;
544+ }
545+
546+ #[ test]
547+ fn lex_string_interp_errors ( ) {
548+ test_lex (
549+ br#"$"foo("baz")bar""# ,
550+ & [
551+ ( Token :: DqStringInterpStart , span ( 0 , 2 ) ) ,
552+ ( Token :: StrInterpChunk , span ( 2 , 5 ) ) ,
553+ ( Token :: Eof , span ( 16 , 16 ) ) ,
554+ ] ,
555+ Err ( Spanned :: new (
556+ LexError :: UnmatchedStrInterpLParen ,
557+ Span :: new ( 5 , 6 ) ,
558+ ) ) ,
559+ ) ;
560+
561+ test_lex (
562+ br#"$'foo('baz')bar'"# ,
563+ & [
564+ ( Token :: SqStringInterpStart , span ( 0 , 2 ) ) ,
565+ ( Token :: StrInterpChunk , span ( 2 , 5 ) ) ,
566+ ( Token :: Eof , span ( 16 , 16 ) ) ,
567+ ] ,
568+ Err ( Spanned :: new (
569+ LexError :: UnmatchedStrInterpLParen ,
570+ Span :: new ( 5 , 6 ) ,
571+ ) ) ,
572+ ) ;
324573 }
325574}
0 commit comments