1
- //! FIXME: write short doc here
1
+ //! Lexer analyzes raw input string and produces lexemes (tokens).
2
+
3
+ use std:: iter:: { FromIterator , IntoIterator } ;
2
4
3
5
use crate :: {
4
6
SyntaxKind :: { self , * } ,
@@ -13,85 +15,242 @@ pub struct Token {
13
15
/// The length of the token.
14
16
pub len : TextUnit ,
15
17
}
18
+ impl Token {
19
+ pub const fn new ( kind : SyntaxKind , len : TextUnit ) -> Self {
20
+ Self { kind, len }
21
+ }
22
+ }
16
23
17
- fn match_literal_kind ( kind : rustc_lexer:: LiteralKind ) -> SyntaxKind {
18
- match kind {
19
- rustc_lexer:: LiteralKind :: Int { .. } => INT_NUMBER ,
20
- rustc_lexer:: LiteralKind :: Float { .. } => FLOAT_NUMBER ,
21
- rustc_lexer:: LiteralKind :: Char { .. } => CHAR ,
22
- rustc_lexer:: LiteralKind :: Byte { .. } => BYTE ,
23
- rustc_lexer:: LiteralKind :: Str { .. } => STRING ,
24
- rustc_lexer:: LiteralKind :: ByteStr { .. } => BYTE_STRING ,
25
- rustc_lexer:: LiteralKind :: RawStr { .. } => RAW_STRING ,
26
- rustc_lexer:: LiteralKind :: RawByteStr { .. } => RAW_BYTE_STRING ,
24
+ #[ derive( Debug ) ]
25
+ /// Represents the result of parsing one token.
26
+ pub struct ParsedToken {
27
+ /// Parsed token.
28
+ pub token : Token ,
29
+ /// If error is present then parsed token is malformed.
30
+ pub error : Option < TokenizeError > ,
31
+ }
32
+ impl ParsedToken {
33
+ pub const fn new ( token : Token , error : Option < TokenizeError > ) -> Self {
34
+ Self { token, error }
27
35
}
28
36
}
29
37
38
+ #[ derive( Debug , Default ) ]
39
+ /// Represents the result of parsing one token.
40
+ pub struct ParsedTokens {
41
+ /// Parsed token.
42
+ pub tokens : Vec < Token > ,
43
+ /// If error is present then parsed token is malformed.
44
+ pub errors : Vec < TokenizeError > ,
45
+ }
46
+
47
+ impl FromIterator < ParsedToken > for ParsedTokens {
48
+ fn from_iter < I : IntoIterator < Item = ParsedToken > > ( iter : I ) -> Self {
49
+ let res = Self :: default ( ) ;
50
+ for entry in iter {
51
+ res. tokens . push ( entry. token ) ;
52
+ if let Some ( error) = entry. error {
53
+ res. errors . push ( error) ;
54
+ }
55
+ }
56
+ res
57
+ }
58
+ }
59
+
60
+ /// Returns the first encountered token from the string.
61
+ /// If the string contains zero or two or more tokens returns `None`.
62
+ pub fn single_token ( text : & str ) -> Option < ParsedToken > {
63
+ // TODO: test whether this condition indeed checks for a single token
64
+ first_token ( text) . filter ( |parsed| parsed. token . len . to_usize ( ) == text. len ( ) )
65
+ }
66
+
67
+ /*
68
+ /// Returns `ParsedTokens` which are basically a pair `(Vec<Token>, Vec<TokenizeError>)`
69
+ /// This is just a shorthand for `tokenize(text).collect()`
70
+ pub fn tokenize_to_vec_with_errors(text: &str) -> ParsedTokens {
71
+ tokenize(text).collect()
72
+ }
73
+
74
+ /// The simplest version of tokenize, it just returns a ready-made `Vec<Token>`.
75
+ /// It discards all tokenization errors while parsing. If you need that information
76
+ /// consider using `tokenize()` or `tokenize_to_vec_with_errors()`.
77
+ pub fn tokenize_to_vec(text: &str) -> Vec<Token> {
78
+ tokenize(text).map(|parsed_token| parsed_token.token).collect()
79
+ }
80
+ */
81
+
30
82
/// Break a string up into its component tokens
31
- pub fn tokenize ( text : & str ) -> Vec < Token > {
32
- if text. is_empty ( ) {
33
- return vec ! [ ] ;
83
+ /// This is the core function, all other `tokenize*()` functions are simply
84
+ /// handy shortcuts for this one.
85
+ pub fn tokenize ( text : & str ) -> impl Iterator < Item = ParsedToken > + ' _ {
86
+ let shebang = rustc_lexer:: strip_shebang ( text) . map ( |shebang_len| {
87
+ text = & text[ shebang_len..] ;
88
+ ParsedToken :: new ( Token :: new ( SHEBANG , TextUnit :: from_usize ( shebang_len) ) , None )
89
+ } ) ;
90
+
91
+ // Notice that we eagerly evaluate shebang since it may change text slice
92
+ // and we cannot simplify this into a single method call chain
93
+ shebang. into_iter ( ) . chain ( tokenize_without_shebang ( text) )
94
+ }
95
+
96
+ pub fn tokenize_without_shebang ( text : & str ) -> impl Iterator < Item = ParsedToken > + ' _ {
97
+ rustc_lexer:: tokenize ( text) . map ( |rustc_token| {
98
+ let token_text = & text[ ..rustc_token. len ] ;
99
+ text = & text[ rustc_token. len ..] ;
100
+ rustc_token_kind_to_parsed_token ( & rustc_token. kind , token_text)
101
+ } )
102
+ }
103
+
104
/// Reasons why a lexed token is considered malformed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenizeError {
    /// Base prefix was provided, but there were no digits
    /// after it, e.g. `0x`.
    EmptyInt,
    /// Float exponent lacks digits, e.g. `e+`, `E+`, `e-`, `E-`.
    EmptyExponent,

    /// Block comment lacks trailing delimiter `*/`.
    UnterminatedBlockComment,
    /// Character literal lacks trailing delimiter `'`.
    UnterminatedChar,
    /// Characterish byte literal lacks trailing delimiter `'`.
    UnterminatedByte,
    /// String literal lacks trailing delimiter `"`.
    UnterminatedString,
    /// Byte string literal lacks trailing delimiter `"`.
    UnterminatedByteString,
    /// Raw string literal lacks trailing delimiter, e.g. `"##`.
    UnterminatedRawString,
    /// Raw byte string literal lacks trailing delimiter, e.g. `"##`.
    UnterminatedRawByteString,

    /// Raw string lacks a quote after pound characters, e.g. `r###`.
    UnstartedRawString,
    /// Raw byte string lacks a quote after pound characters, e.g. `br###`.
    UnstartedRawByteString,

    /// Lifetime starts with a number, e.g. `'4ever`.
    LifetimeStartsWithNumber,
}
135
+
136
+ fn rustc_token_kind_to_parsed_token (
137
+ rustc_token_kind : & rustc_lexer:: TokenKind ,
138
+ token_text : & str ,
139
+ ) -> ParsedToken {
140
+ use rustc_lexer:: TokenKind as TK ;
141
+ use TokenizeError as TE ;
142
+
143
+ // We drop some useful infromation here (see patterns with double dots `..`)
144
+ // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
145
+ // being `u16` that come from `rowan::SyntaxKind` type and changes to `rowan::SyntaxKind`
146
+ // would mean hell of a rewrite.
147
+
148
+ let ( syntax_kind, error) = match * rustc_token_kind {
149
+ TK :: LineComment => ok ( COMMENT ) ,
150
+ TK :: BlockComment { terminated } => ok_if ( terminated, COMMENT , TE :: UnterminatedBlockComment ) ,
151
+ TK :: Whitespace => ok ( WHITESPACE ) ,
152
+ TK :: Ident => ok ( if token_text == "_" {
153
+ UNDERSCORE
154
+ } else {
155
+ SyntaxKind :: from_keyword ( token_text) . unwrap_or ( IDENT )
156
+ } ) ,
157
+ TK :: RawIdent => ok ( IDENT ) ,
158
+ TK :: Literal { kind, .. } => match_literal_kind ( & kind) ,
159
+ TK :: Lifetime { starts_with_number } => {
160
+ ok_if ( !starts_with_number, LIFETIME , TE :: LifetimeStartsWithNumber )
161
+ }
162
+ TK :: Semi => ok ( SEMI ) ,
163
+ TK :: Comma => ok ( COMMA ) ,
164
+ TK :: Dot => ok ( DOT ) ,
165
+ TK :: OpenParen => ok ( L_PAREN ) ,
166
+ TK :: CloseParen => ok ( R_PAREN ) ,
167
+ TK :: OpenBrace => ok ( L_CURLY ) ,
168
+ TK :: CloseBrace => ok ( R_CURLY ) ,
169
+ TK :: OpenBracket => ok ( L_BRACK ) ,
170
+ TK :: CloseBracket => ok ( R_BRACK ) ,
171
+ TK :: At => ok ( AT ) ,
172
+ TK :: Pound => ok ( POUND ) ,
173
+ TK :: Tilde => ok ( TILDE ) ,
174
+ TK :: Question => ok ( QUESTION ) ,
175
+ TK :: Colon => ok ( COLON ) ,
176
+ TK :: Dollar => ok ( DOLLAR ) ,
177
+ TK :: Eq => ok ( EQ ) ,
178
+ TK :: Not => ok ( EXCL ) ,
179
+ TK :: Lt => ok ( L_ANGLE ) ,
180
+ TK :: Gt => ok ( R_ANGLE ) ,
181
+ TK :: Minus => ok ( MINUS ) ,
182
+ TK :: And => ok ( AMP ) ,
183
+ TK :: Or => ok ( PIPE ) ,
184
+ TK :: Plus => ok ( PLUS ) ,
185
+ TK :: Star => ok ( STAR ) ,
186
+ TK :: Slash => ok ( SLASH ) ,
187
+ TK :: Caret => ok ( CARET ) ,
188
+ TK :: Percent => ok ( PERCENT ) ,
189
+ TK :: Unknown => ok ( ERROR ) ,
190
+ } ;
191
+
192
+ return ParsedToken :: new (
193
+ Token :: new ( syntax_kind, TextUnit :: from_usize ( token_text. len ( ) ) ) ,
194
+ error,
195
+ ) ;
196
+
197
+ type ParsedSyntaxKind = ( SyntaxKind , Option < TokenizeError > ) ;
198
+
199
+ const fn ok ( syntax_kind : SyntaxKind ) -> ParsedSyntaxKind {
200
+ ( syntax_kind, None )
34
201
}
35
- let mut text = text;
36
- let mut acc = Vec :: new ( ) ;
37
- if let Some ( len) = rustc_lexer:: strip_shebang ( text) {
38
- acc. push ( Token { kind : SHEBANG , len : TextUnit :: from_usize ( len) } ) ;
39
- text = & text[ len..] ;
202
+ const fn ok_if ( cond : bool , syntax_kind : SyntaxKind , error : TokenizeError ) -> ParsedSyntaxKind {
203
+ if cond {
204
+ ok ( syntax_kind)
205
+ } else {
206
+ err ( syntax_kind, error)
207
+ }
40
208
}
41
- while !text. is_empty ( ) {
42
- let rustc_token = rustc_lexer:: first_token ( text) ;
43
- let kind = match rustc_token. kind {
44
- rustc_lexer:: TokenKind :: LineComment => COMMENT ,
45
- rustc_lexer:: TokenKind :: BlockComment { .. } => COMMENT ,
46
- rustc_lexer:: TokenKind :: Whitespace => WHITESPACE ,
47
- rustc_lexer:: TokenKind :: Ident => {
48
- let token_text = & text[ ..rustc_token. len ] ;
49
- if token_text == "_" {
50
- UNDERSCORE
51
- } else {
52
- SyntaxKind :: from_keyword ( & text[ ..rustc_token. len ] ) . unwrap_or ( IDENT )
53
- }
209
+ const fn err ( syntax_kind : SyntaxKind , error : TokenizeError ) -> ParsedSyntaxKind {
210
+ ( syntax_kind, Some ( error) )
211
+ }
212
+
213
+ const fn match_literal_kind ( kind : & rustc_lexer:: LiteralKind ) -> ParsedSyntaxKind {
214
+ use rustc_lexer:: LiteralKind as LK ;
215
+ match * kind {
216
+ LK :: Int { empty_int, .. } => ok_if ( !empty_int, INT_NUMBER , TE :: EmptyInt ) ,
217
+ LK :: Float { empty_exponent, .. } => {
218
+ ok_if ( !empty_exponent, FLOAT_NUMBER , TE :: EmptyExponent )
54
219
}
55
- rustc_lexer:: TokenKind :: RawIdent => IDENT ,
56
- rustc_lexer:: TokenKind :: Literal { kind, .. } => match_literal_kind ( kind) ,
57
- rustc_lexer:: TokenKind :: Lifetime { .. } => LIFETIME ,
58
- rustc_lexer:: TokenKind :: Semi => SEMI ,
59
- rustc_lexer:: TokenKind :: Comma => COMMA ,
60
- rustc_lexer:: TokenKind :: Dot => DOT ,
61
- rustc_lexer:: TokenKind :: OpenParen => L_PAREN ,
62
- rustc_lexer:: TokenKind :: CloseParen => R_PAREN ,
63
- rustc_lexer:: TokenKind :: OpenBrace => L_CURLY ,
64
- rustc_lexer:: TokenKind :: CloseBrace => R_CURLY ,
65
- rustc_lexer:: TokenKind :: OpenBracket => L_BRACK ,
66
- rustc_lexer:: TokenKind :: CloseBracket => R_BRACK ,
67
- rustc_lexer:: TokenKind :: At => AT ,
68
- rustc_lexer:: TokenKind :: Pound => POUND ,
69
- rustc_lexer:: TokenKind :: Tilde => TILDE ,
70
- rustc_lexer:: TokenKind :: Question => QUESTION ,
71
- rustc_lexer:: TokenKind :: Colon => COLON ,
72
- rustc_lexer:: TokenKind :: Dollar => DOLLAR ,
73
- rustc_lexer:: TokenKind :: Eq => EQ ,
74
- rustc_lexer:: TokenKind :: Not => EXCL ,
75
- rustc_lexer:: TokenKind :: Lt => L_ANGLE ,
76
- rustc_lexer:: TokenKind :: Gt => R_ANGLE ,
77
- rustc_lexer:: TokenKind :: Minus => MINUS ,
78
- rustc_lexer:: TokenKind :: And => AMP ,
79
- rustc_lexer:: TokenKind :: Or => PIPE ,
80
- rustc_lexer:: TokenKind :: Plus => PLUS ,
81
- rustc_lexer:: TokenKind :: Star => STAR ,
82
- rustc_lexer:: TokenKind :: Slash => SLASH ,
83
- rustc_lexer:: TokenKind :: Caret => CARET ,
84
- rustc_lexer:: TokenKind :: Percent => PERCENT ,
85
- rustc_lexer:: TokenKind :: Unknown => ERROR ,
86
- } ;
87
- let token = Token { kind, len : TextUnit :: from_usize ( rustc_token. len ) } ;
88
- acc. push ( token) ;
89
- text = & text[ rustc_token. len ..] ;
220
+ LK :: Char { terminated } => ok_if ( terminated, CHAR , TE :: UnterminatedChar ) ,
221
+ LK :: Byte { terminated } => ok_if ( terminated, BYTE , TE :: UnterminatedByte ) ,
222
+ LK :: Str { terminated } => ok_if ( terminated, STRING , TE :: UnterminatedString ) ,
223
+ LK :: ByteStr { terminated } => {
224
+ ok_if ( terminated, BYTE_STRING , TE :: UnterminatedByteString )
225
+ }
226
+
227
+ LK :: RawStr { started : true , terminated, .. } => {
228
+ ok_if ( terminated, RAW_STRING , TE :: UnterminatedRawString )
229
+ }
230
+ LK :: RawStr { started : false , .. } => err ( RAW_STRING , TE :: UnstartedRawString ) ,
231
+
232
+ LK :: RawByteStr { started : true , terminated, .. } => {
233
+ ok_if ( terminated, RAW_BYTE_STRING , TE :: UnterminatedRawByteString )
234
+ }
235
+ LK :: RawByteStr { started : false , .. } => {
236
+ err ( RAW_BYTE_STRING , TE :: UnstartedRawByteString )
237
+ }
238
+ }
239
+ }
240
+ }
241
+
242
+ pub fn first_token ( text : & str ) -> Option < ParsedToken > {
243
+ // Checking for emptyness because of `rustc_lexer::first_token()` invariant (see its body)
244
+ if text. is_empty ( ) {
245
+ None
246
+ } else {
247
+ let rustc_token = rustc_lexer:: first_token ( text) ;
248
+ Some ( rustc_token_kind_to_parsed_token ( & rustc_token. kind , & text[ ..rustc_token. len ] ) )
90
249
}
91
- acc
92
250
}
93
251
94
- pub fn classify_literal ( text : & str ) -> Option < Token > {
252
+ // TODO: think what to do with this ad hoc function
253
+ pub fn classify_literal ( text : & str ) -> Option < ParsedToken > {
95
254
let t = rustc_lexer:: first_token ( text) ;
96
255
if t. len != text. len ( ) {
97
256
return None ;
@@ -100,5 +259,5 @@ pub fn classify_literal(text: &str) -> Option<Token> {
100
259
rustc_lexer:: TokenKind :: Literal { kind, .. } => match_literal_kind ( kind) ,
101
260
_ => return None ,
102
261
} ;
103
- Some ( Token { kind, len : TextUnit :: from_usize ( t. len ) } )
262
+ Some ( ParsedToken :: new ( Token :: new ( kind, TextUnit :: from_usize ( t. len ) ) ) )
104
263
}
0 commit comments