Skip to content

Commit 010b3ea

Browse files
committed
Expose state of LuaLexer
This will allow parsing code snippets in comments line-by-line.
1 parent 8184dec commit 010b3ea

File tree

4 files changed

+136
-99
lines changed

4 files changed

+136
-99
lines changed

crates/emmylua_parser/src/lexer/lua_lexer.rs

Lines changed: 122 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,58 @@ use super::{is_name_continue, is_name_start, lexer_config::LexerConfig, token_da
55
/// Lua lexer. Holds the character reader it consumes, the lexer
/// configuration, an optional error sink, and the state that is carried over
/// when the reader is replaced.
pub struct LuaLexer<'a> {
66
reader: Reader<'a>,
77
lexer_config: LexerConfig,
8-
errors: &'a mut Vec<LuaParseError>,
8+
// Optional error sink: `error()` pushes into it only when this is `Some`.
errors: Option<&'a mut Vec<LuaParseError>>,
9+
// Selects what `tokenize()` runs next: regular lexing, or the continuation
// of an unfinished string / long string / long comment.
state: LuaLexerState,
910
}
1011

11-
impl LuaLexer<'_> {
12-
pub fn new<'a>(
13-
text: &'a str,
12+
/// Lexer state that is preserved between reader resets. It is needed when the
13+
/// lexer does not see the whole input source and is instead handed a separate
14+
/// reader for each individual line, which happens when lexing code blocks
15+
/// embedded in comments.
16+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
17+
pub enum LuaLexerState {
18+
/// Not inside a multi-line token; `tokenize` calls the regular `lex`.
Normal,
19+
/// Inside a quoted string; the payload is the quote character that
/// `lex_string` looks for to terminate it.
String(char),
20+
/// Inside a long string; the payload is the `sep` value that is passed on
/// to `lex_long_string`.
LongString(usize),
21+
/// Inside a long comment; same payload as `LongString`, but `tokenize`
/// reports the resulting token as `TkLongComment`.
LongComment(usize),
22+
}
23+
24+
impl<'a> LuaLexer<'a> {
25+
/// Creates a lexer over `reader` that starts in `LuaLexerState::Normal`.
///
/// Thin wrapper around `new_with_state`. `errors` is an optional sink for
/// syntax errors; when it is `None`, errors reported through `error()` are
/// not stored anywhere.
pub fn new(
26+
reader: Reader<'a>,
27+
lexer_config: LexerConfig,
28+
errors: Option<&'a mut Vec<LuaParseError>>,
29+
) -> Self {
30+
Self::new_with_state(reader, LuaLexerState::Normal, lexer_config, errors)
31+
}
32+
33+
/// Creates a lexer over `reader` that resumes from the given `state`.
///
/// The arguments are stored as-is. `state` decides which routine the first
/// `tokenize` iteration runs (see the `match` in `tokenize`), so a caller can
/// continue a string, long string or long comment left open by earlier input.
/// `errors` is an optional sink for syntax errors.
pub fn new_with_state(
34+
reader: Reader<'a>,
35+
state: LuaLexerState,
1436
lexer_config: LexerConfig,
15-
errors: &'a mut Vec<LuaParseError>,
16-
) -> LuaLexer<'a> {
37+
errors: Option<&'a mut Vec<LuaParseError>>,
38+
) -> Self {
1739
LuaLexer {
18-
reader: Reader::new(text),
40+
reader,
1941
lexer_config,
2042
errors,
43+
state,
2144
}
2245
}
2346

2447
pub fn tokenize(&mut self) -> Vec<LuaTokenData> {
2548
let mut tokens = vec![];
2649

2750
while !self.reader.is_eof() {
28-
let kind = self.lex();
51+
let kind = match self.state {
52+
LuaLexerState::Normal => self.lex(),
53+
LuaLexerState::String(quote) => self.lex_string(quote),
54+
LuaLexerState::LongString(sep) => self.lex_long_string(sep),
55+
LuaLexerState::LongComment(sep) => {
56+
self.lex_long_string(sep);
57+
LuaTokenKind::TkLongComment
58+
}
59+
};
2960
if kind == LuaTokenKind::TkEof {
3061
break;
3162
}
@@ -36,6 +67,16 @@ impl LuaLexer<'_> {
3667
tokens
3768
}
3869

70+
/// Returns a copy of the current lexer state.
///
/// After `tokenize` has consumed a reader, this reports whether the input
/// ended in the middle of a string, long string or long comment; the value
/// can be handed to `new_with_state` to resume from there.
pub fn get_state(&self) -> LuaLexerState {
71+
self.state
72+
}
73+
74+
/// Replaces the exhausted reader with `reader` and tokenizes it, keeping the
/// lexer state left over from the previous reader.
///
/// Returns the tokens produced from the new reader.
///
/// # Panics
///
/// Panics if the previous reader has not reached end of input.
pub fn continue_with_new_reader(&mut self, reader: Reader<'a>) -> Vec<LuaTokenData> {
75+
// Swapping readers mid-input would silently drop the unread remainder.
assert!(self.reader.is_eof(), "previous reader wasn't exhausted");
76+
self.reader = reader;
77+
self.tokenize()
78+
}
79+
3980
fn support_non_std_symbol(&self, symbol: LuaNonStdSymbol) -> bool {
4081
self.lexer_config.non_std_symbols.support(symbol)
4182
}
@@ -105,6 +146,7 @@ impl LuaLexer<'_> {
105146
let sep = self.skip_sep();
106147
if self.reader.current_char() == '[' {
107148
self.reader.bump();
149+
self.state = LuaLexerState::LongComment(sep);
108150
self.lex_long_string(sep);
109151
return LuaTokenKind::TkLongComment;
110152
}
@@ -120,14 +162,12 @@ impl LuaLexer<'_> {
120162
return LuaTokenKind::TkLeftBracket;
121163
}
122164
if self.reader.current_char() != '[' {
123-
self.errors.push(LuaParseError::syntax_error_from(
124-
&t!("invalid long string delimiter"),
125-
self.reader.current_range(),
126-
));
165+
self.error(|| t!("invalid long string delimiter"));
127166
return LuaTokenKind::TkLongString;
128167
}
129168

130169
self.reader.bump();
170+
self.state = LuaLexerState::LongString(sep);
131171
self.lex_long_string(sep)
132172
}
133173
'=' => {
@@ -147,10 +187,7 @@ impl LuaLexer<'_> {
147187
}
148188
'<' => {
149189
if !self.lexer_config.support_integer_operation() {
150-
self.errors.push(LuaParseError::syntax_error_from(
151-
&t!("bitwise operation is not supported"),
152-
self.reader.current_range(),
153-
));
190+
self.error(|| t!("bitwise operation is not supported"));
154191
}
155192

156193
self.reader.bump();
@@ -174,10 +211,7 @@ impl LuaLexer<'_> {
174211
}
175212
'>' => {
176213
if !self.lexer_config.support_integer_operation() {
177-
self.errors.push(LuaParseError::syntax_error_from(
178-
&t!("bitwise operation is not supported"),
179-
self.reader.current_range(),
180-
));
214+
self.error(|| t!("bitwise operation is not supported"));
181215
}
182216

183217
self.reader.bump();
@@ -196,10 +230,7 @@ impl LuaLexer<'_> {
196230
self.reader.bump();
197231
if self.reader.current_char() != '=' {
198232
if !self.lexer_config.support_integer_operation() {
199-
self.errors.push(LuaParseError::syntax_error_from(
200-
&t!("bitwise operation is not supported"),
201-
self.reader.current_range(),
202-
));
233+
self.error(|| t!("bitwise operation is not supported"));
203234
}
204235
return LuaTokenKind::TkBitXor;
205236
}
@@ -222,43 +253,8 @@ impl LuaLexer<'_> {
222253
}
223254

224255
self.reader.bump();
225-
while !self.reader.is_eof() {
226-
let ch = self.reader.current_char();
227-
if ch == quote || ch == '\n' || ch == '\r' {
228-
break;
229-
}
230-
231-
if ch != '\\' {
232-
self.reader.bump();
233-
continue;
234-
}
235-
236-
self.reader.bump();
237-
match self.reader.current_char() {
238-
'z' => {
239-
self.reader.bump();
240-
self.reader
241-
.eat_while(|c| c == ' ' || c == '\t' || c == '\r' || c == '\n');
242-
}
243-
'\r' | '\n' => {
244-
self.lex_new_line();
245-
}
246-
_ => {
247-
self.reader.bump();
248-
}
249-
}
250-
}
251-
252-
if self.reader.current_char() != quote {
253-
self.errors.push(LuaParseError::syntax_error_from(
254-
&t!("unfinished string"),
255-
self.reader.current_range(),
256-
));
257-
return LuaTokenKind::TkString;
258-
}
259-
260-
self.reader.bump();
261-
LuaTokenKind::TkString
256+
self.state = LuaLexerState::String(quote);
257+
self.lex_string(quote)
262258
}
263259
'.' => {
264260
if self.reader.next_char().is_ascii_digit() {
@@ -295,10 +291,7 @@ impl LuaLexer<'_> {
295291
}
296292
}
297293
_ if self.reader.is_eof() => {
298-
self.errors.push(LuaParseError::syntax_error_from(
299-
&t!("unfinished long comment"),
300-
self.reader.current_range(),
301-
));
294+
self.error(|| t!("unfinished long comment"));
302295
return LuaTokenKind::TkLongComment;
303296
}
304297
_ => {
@@ -321,10 +314,7 @@ impl LuaLexer<'_> {
321314
}
322315
_ => {
323316
if !self.lexer_config.support_integer_operation() {
324-
self.errors.push(LuaParseError::syntax_error_from(
325-
&t!("integer division is not supported"),
326-
self.reader.current_range(),
327-
));
317+
self.error(|| t!("integer division is not supported"));
328318
}
329319

330320
self.reader.bump();
@@ -403,10 +393,7 @@ impl LuaLexer<'_> {
403393
}
404394
'&' => {
405395
if !self.lexer_config.support_integer_operation() {
406-
self.errors.push(LuaParseError::syntax_error_from(
407-
&t!("bitwise operation is not supported"),
408-
self.reader.current_range(),
409-
));
396+
self.error(|| t!("bitwise operation is not supported"));
410397
}
411398

412399
self.reader.bump();
@@ -426,10 +413,7 @@ impl LuaLexer<'_> {
426413
}
427414
'|' => {
428415
if !self.lexer_config.support_integer_operation() {
429-
self.errors.push(LuaParseError::syntax_error_from(
430-
&t!("bitwise operation is not supported"),
431-
self.reader.current_range(),
432-
));
416+
self.error(|| t!("bitwise operation is not supported"));
433417
}
434418

435419
self.reader.bump();
@@ -524,6 +508,47 @@ impl LuaLexer<'_> {
524508
self.reader.eat_when('=')
525509
}
526510

511+
/// Lexes the body of a quoted string terminated by `quote` and always returns
/// `TkString`.
///
/// The caller in `lex` bumps the reader before calling, so scanning starts
/// inside the string; `tokenize` also calls this directly to continue a
/// string when the state is `LuaLexerState::String`.
///
/// Scanning stops at `quote`, at an unescaped `\n` / `\r`, or at end of input.
/// The state is reset to `Normal` unless the reader ran out before a closing
/// quote or line break was seen, in which case it is left unchanged.
/// "unfinished string" is reported whenever the closing quote is missing.
fn lex_string(&mut self, quote: char) -> LuaTokenKind {
512+
while !self.reader.is_eof() {
513+
let ch = self.reader.current_char();
514+
if ch == quote || ch == '\n' || ch == '\r' {
515+
break;
516+
}
517+
518+
if ch != '\\' {
519+
self.reader.bump();
520+
continue;
521+
}
522+
523+
// Consume the backslash, then decide how much the escape swallows.
self.reader.bump();
524+
match self.reader.current_char() {
525+
// `\z` also swallows the whitespace (including line breaks) after it.
'z' => {
526+
self.reader.bump();
527+
self.reader
528+
.eat_while(|c| c == ' ' || c == '\t' || c == '\r' || c == '\n');
529+
}
530+
// A backslash before a line break keeps the string going.
'\r' | '\n' => {
531+
self.lex_new_line();
532+
}
533+
// Any other escaped character is skipped, so an escaped quote does
// not end the string.
_ => {
534+
self.reader.bump();
535+
}
536+
}
537+
}
538+
539+
// Leave string state when the string closed or stopped at a line break;
// at end of input the state is kept so a later reader can continue it.
if self.reader.current_char() == quote || !self.reader.is_eof() {
540+
self.state = LuaLexerState::Normal;
541+
}
542+
543+
if self.reader.current_char() != quote {
544+
self.error(|| t!("unfinished string"));
545+
return LuaTokenKind::TkString;
546+
}
547+
548+
// Consume the closing quote.
self.reader.bump();
549+
LuaTokenKind::TkString
550+
}
551+
527552
fn lex_long_string(&mut self, sep: usize) -> LuaTokenKind {
528553
let mut end = false;
529554
while !self.reader.is_eof() {
@@ -543,11 +568,12 @@ impl LuaLexer<'_> {
543568
}
544569
}
545570

571+
if end || !self.reader.is_eof() {
572+
self.state = LuaLexerState::Normal;
573+
}
574+
546575
if !end {
547-
self.errors.push(LuaParseError::syntax_error_from(
548-
&t!("unfinished long string or comment"),
549-
self.reader.current_range(),
550-
));
576+
self.error(|| t!("unfinished long string or comment"));
551577
}
552578

553579
LuaTokenKind::TkLongString
@@ -666,18 +692,26 @@ impl LuaLexer<'_> {
666692
}
667693

668694
if self.reader.current_char().is_alphabetic() {
669-
self.errors.push(LuaParseError::syntax_error_from(
670-
&format!(
671-
"unexpected character '{}' after number literal",
672-
self.reader.current_char()
673-
),
674-
self.reader.current_range(),
675-
));
695+
let ch = self.reader.current_char();
696+
self.error(|| format!("unexpected character '{ch}' after number literal"));
676697
}
677698

678699
match state {
679700
NumberState::Int | NumberState::Hex => LuaTokenKind::TkInt,
680701
_ => LuaTokenKind::TkFloat,
681702
}
682703
}
704+
705+
/// Records a syntax error at the reader's current range, if an error sink is
/// present.
///
/// `msg` is a closure so that the message is only built when `self.errors`
/// is `Some`; with no sink the closure is never called.
fn error<F, R>(&mut self, msg: F)
706+
where
707+
F: FnOnce() -> R,
708+
R: AsRef<str>,
709+
{
710+
if let Some(errors) = &mut self.errors {
711+
errors.push(LuaParseError::syntax_error_from(
712+
msg().as_ref(),
713+
self.reader.current_range(),
714+
))
715+
}
716+
}
683717
}

crates/emmylua_parser/src/lexer/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ mod token_data;
66

77
pub use lexer_config::LexerConfig;
88
pub use lua_doc_lexer::{LuaDocLexer, LuaDocLexerState};
9-
pub use lua_lexer::LuaLexer;
9+
pub use lua_lexer::{LuaLexer, LuaLexerState};
1010
pub use token_data::LuaTokenData;
1111

1212
fn is_name_start(ch: char) -> bool {

crates/emmylua_parser/src/lexer/test.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#[cfg(test)]
22
mod tests {
3+
use crate::text::Reader;
34
use crate::{
45
LuaNonStdSymbol,
56
lexer::{LexerConfig, LuaLexer},
@@ -83,7 +84,7 @@ mod tests {
8384
"#;
8485
let config = LexerConfig::default();
8586
let mut errors: Vec<LuaParseError> = Vec::new();
86-
let mut lexer = LuaLexer::new(text, config, &mut errors);
87+
let mut lexer = LuaLexer::new(Reader::new(text), config, Some(&mut errors));
8788
let tokens = lexer.tokenize();
8889
// for token in &tokens {
8990
// println!("{:?}", token);
@@ -1141,7 +1142,7 @@ LuaTokenData { kind: TkWhitespace, range: SourceRange { start_offset: 2036, leng
11411142
]);
11421143

11431144
let mut errors: Vec<LuaParseError> = Vec::new();
1144-
let mut lexer = LuaLexer::new(text, config, &mut errors);
1145+
let mut lexer = LuaLexer::new(Reader::new(text), config, Some(&mut errors));
11451146
let tokens = lexer.tokenize();
11461147

11471148
let test_str = tokens

0 commit comments

Comments
 (0)