Skip to content

Commit 393b380

Browse files
committed
Rewrite grammar
1 parent 8919bb5 commit 393b380

File tree

8 files changed

+16292
-48478
lines changed

8 files changed

+16292
-48478
lines changed

grammar.js

Lines changed: 191 additions & 308 deletions
Large diffs are not rendered by default.

src/grammar.json

Lines changed: 784 additions & 1149 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/node-types.json

Lines changed: 506 additions & 1860 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/parser.c

Lines changed: 13630 additions & 44604 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/scanner.c

Lines changed: 239 additions & 80 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,16 @@
11
#include <tree_sitter/parser.h>
2+
#include <wctype.h>
23

34
enum TokenType
45
{
5-
SHORT_SQ_STRING_CONTENT,
6-
SHORT_DQ_STRING_CONTENT,
7-
LONG_STRING,
8-
LONG_COMMENT,
6+
COMMENT_START,
7+
COMMENT_CONTENT,
8+
COMMENT_END,
9+
STRING_START,
10+
STRING_CONTENT,
11+
STRING_END,
912
};
1013

11-
// helpers
12-
const char EOF = 0;
13-
1414
static void consume(TSLexer *lexer)
1515
{
1616
lexer->advance(lexer, false);
@@ -19,8 +19,20 @@ static void skip(TSLexer *lexer)
1919
{
2020
lexer->advance(lexer, true);
2121
}
22+
static bool consume_if(TSLexer *lexer, const int32_t character)
23+
{
24+
if (lexer->lookahead == character)
25+
{
26+
consume(lexer);
27+
return true;
28+
}
29+
30+
return false;
31+
}
32+
33+
const char SQ_STRING_DELIMITER = '\'';
34+
const char DQ_STRING_DELIMITER = '"';
2235

23-
// scanner
2436
void *tree_sitter_lua_external_scanner_create()
2537
{
2638
return NULL;
@@ -30,120 +42,267 @@ void tree_sitter_lua_external_scanner_destroy(void *payload)
3042
{
3143
}
3244

33-
unsigned tree_sitter_lua_external_scanner_serialize(
34-
void *payload,
35-
char *buffer)
45+
enum StartedToken
3646
{
37-
return 0;
47+
SHORT_COMMENT = 1,
48+
SHORT_SQ_STRING,
49+
SHORT_DQ_STRING,
50+
LONG_COMMENT,
51+
LONG_STRING,
52+
};
53+
54+
unsigned short started = 0;
55+
unsigned int depth = 0;
56+
57+
unsigned int tree_sitter_lua_external_scanner_serialize(void *payload, char *buffer)
58+
{
59+
buffer[0] = started;
60+
buffer[1] = depth;
61+
return 2;
3862
}
3963

40-
void tree_sitter_lua_external_scanner_deserialize(
41-
void *payload,
42-
const char *buffer,
43-
unsigned length)
64+
void tree_sitter_lua_external_scanner_deserialize(void *payload, const char *buffer, unsigned int length)
4465
{
66+
if (length == 2)
67+
{
68+
started = buffer[0];
69+
depth = buffer[1];
70+
}
71+
}
72+
73+
static unsigned int get_depth(TSLexer *lexer)
74+
{
75+
unsigned int current_depth = 0;
76+
while (consume_if(lexer, '='))
77+
{
78+
current_depth += 1;
79+
}
80+
81+
return current_depth;
4582
}
4683

47-
static bool is_not_new_line_or_eof(TSLexer *lexer)
84+
static bool scan_depth(TSLexer *lexer)
4885
{
49-
return lexer->lookahead != '\n' && lexer->lookahead != EOF;
86+
unsigned int remaining_depth = depth;
87+
while (remaining_depth > 0 && consume_if(lexer, '='))
88+
{
89+
remaining_depth -= 1;
90+
}
91+
92+
return remaining_depth == 0;
5093
}
5194

52-
bool tree_sitter_lua_external_scanner_scan(
53-
void *payload,
54-
TSLexer *lexer,
55-
const bool *valid_symbols)
95+
bool tree_sitter_lua_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols)
5696
{
57-
if (valid_symbols[SHORT_SQ_STRING_CONTENT] ||
58-
valid_symbols[SHORT_DQ_STRING_CONTENT])
97+
switch (started)
98+
{
99+
case SHORT_COMMENT:
59100
{
60-
const bool is_single_quote = valid_symbols[SHORT_SQ_STRING_CONTENT];
61-
const char end_quote = is_single_quote ? '\'' : '"';
101+
// try to match the short comment's end (new line or eof)
102+
if (lexer->lookahead == '\n' || lexer->eof(lexer))
103+
{
104+
if (valid_symbols[COMMENT_END])
105+
{
106+
started = 0;
62107

63-
// try to consume almost anything
64-
if (lexer->lookahead != end_quote && is_not_new_line_or_eof(lexer))
108+
lexer->result_symbol = COMMENT_END;
109+
return true;
110+
}
111+
}
112+
else if (valid_symbols[COMMENT_CONTENT])
65113
{
114+
// consume all characters till a short comment's end
66115
do
67116
{
68-
// try to consume a backslash
69-
if (lexer->lookahead == '\\')
70-
{
71-
consume(lexer);
117+
consume(lexer);
118+
} while (lexer->lookahead != '\n' && !lexer->eof(lexer));
72119

73-
// try to consume almost anything
74-
if (is_not_new_line_or_eof(lexer))
75-
{
76-
consume(lexer);
77-
}
78-
}
79-
else
120+
lexer->result_symbol = COMMENT_CONTENT;
121+
return true;
122+
}
123+
124+
break;
125+
}
126+
case SHORT_SQ_STRING:
127+
case SHORT_DQ_STRING:
128+
{
129+
// define the short string's delimiter
130+
const char delimiter = started == SHORT_SQ_STRING ? SQ_STRING_DELIMITER : DQ_STRING_DELIMITER;
131+
132+
// try to match the short string's end (" or ')
133+
if (consume_if(lexer, delimiter))
134+
{
135+
if (valid_symbols[STRING_END])
136+
{
137+
started = 0;
138+
139+
lexer->result_symbol = STRING_END;
140+
return true;
141+
}
142+
}
143+
else if (valid_symbols[STRING_CONTENT] && lexer->lookahead != '\n' && !lexer->eof(lexer))
144+
{
145+
// consume any character till a short string's end, new line or eof
146+
do
147+
{
148+
// consume any character after a backslash, unless it's a new line or eof
149+
if (consume_if(lexer, '\\') && (lexer->lookahead == '\n' || lexer->eof(lexer)))
80150
{
81-
consume(lexer);
151+
break;
82152
}
83-
// consume almost everything
84-
} while (lexer->lookahead != end_quote && is_not_new_line_or_eof(lexer));
85153

86-
lexer->result_symbol = is_single_quote
87-
? SHORT_SQ_STRING_CONTENT
88-
: SHORT_DQ_STRING_CONTENT;
154+
consume(lexer);
155+
} while (lexer->lookahead != delimiter && lexer->lookahead != '\n' && !lexer->eof(lexer));
156+
157+
lexer->result_symbol = STRING_CONTENT;
89158
return true;
90159
}
160+
161+
break;
91162
}
92-
else if (valid_symbols[LONG_STRING] || valid_symbols[LONG_COMMENT])
163+
case LONG_COMMENT:
164+
case LONG_STRING:
93165
{
94-
// try to consume a first opening bracket
95-
if (lexer->lookahead == '[')
96-
{
97-
consume(lexer);
166+
const bool is_inside_a_comment = started == LONG_COMMENT;
98167

99-
// consume any level delimiters, and store how many there are
100-
unsigned int level_count = 0;
101-
while (lexer->lookahead == '=')
168+
bool some_characters_were_consumed = false;
169+
if (is_inside_a_comment ? valid_symbols[COMMENT_END] : valid_symbols[STRING_END])
170+
{
171+
// try to match the long comment's/string's end (]=*])
172+
if (consume_if(lexer, ']'))
102173
{
103-
consume(lexer);
104-
level_count += 1;
174+
if (scan_depth(lexer) && consume_if(lexer, ']'))
175+
{
176+
started = 0;
177+
depth = 0;
178+
179+
lexer->result_symbol = is_inside_a_comment ? COMMENT_END : STRING_END;
180+
return true;
181+
}
182+
183+
some_characters_were_consumed = true;
105184
}
185+
}
106186

107-
// try to consume the last opening bracket
108-
if (lexer->lookahead == '[')
187+
if (is_inside_a_comment ? valid_symbols[COMMENT_CONTENT] : valid_symbols[STRING_CONTENT])
188+
{
189+
if (!some_characters_were_consumed)
109190
{
191+
if (lexer->eof(lexer))
192+
{
193+
break;
194+
}
195+
196+
// consume the next character as it can't start a long comment's/string's end (])
110197
consume(lexer);
198+
}
111199

112-
// consume almost everything
113-
while (lexer->lookahead != EOF)
200+
// consume any character till a long comment's/string's end or eof
201+
while (true)
202+
{
203+
lexer->mark_end(lexer);
204+
if (consume_if(lexer, ']'))
114205
{
115-
// try to consume a first closing bracket
116-
if (lexer->lookahead == ']')
206+
if (scan_depth(lexer))
117207
{
118-
consume(lexer);
119-
120-
// try to consume every level delimiters
121-
unsigned int current_level;
122-
for (current_level = level_count;
123-
current_level > 0 && lexer->lookahead == '=';
124-
current_level -= 1)
208+
if (consume_if(lexer, ']'))
125209
{
126-
consume(lexer);
210+
break;
127211
}
212+
}
213+
else
214+
{
215+
continue;
216+
}
217+
}
218+
219+
if (lexer->eof(lexer))
220+
{
221+
break;
222+
}
223+
224+
consume(lexer);
225+
}
226+
227+
lexer->result_symbol = is_inside_a_comment ? COMMENT_CONTENT : STRING_CONTENT;
228+
return true;
229+
}
230+
231+
break;
232+
}
233+
default:
234+
{
235+
// ignore all whitespace
236+
while (iswspace(lexer->lookahead))
237+
{
238+
skip(lexer);
239+
}
240+
241+
if (valid_symbols[COMMENT_START])
242+
{
243+
// try to match a short comment's start (--)
244+
if (consume_if(lexer, '-'))
245+
{
246+
if (consume_if(lexer, '-'))
247+
{
248+
started = SHORT_COMMENT;
249+
250+
// try to match a long comment's start (--[=*[)
251+
lexer->mark_end(lexer);
252+
if (consume_if(lexer, '['))
253+
{
254+
unsigned int possible_depth = get_depth(lexer);
128255

129-
// try to consume the last closing bracket if all levels are consumed
130-
if (current_level == 0 && lexer->lookahead == ']')
256+
if (consume_if(lexer, '['))
131257
{
132-
consume(lexer);
258+
started = LONG_COMMENT;
259+
depth = possible_depth;
133260

134-
lexer->result_symbol = valid_symbols[LONG_STRING]
135-
? LONG_STRING
136-
: LONG_COMMENT;
137-
return true;
261+
lexer->mark_end(lexer);
138262
}
139263
}
140-
else
141-
{
142-
consume(lexer);
143-
}
264+
265+
lexer->result_symbol = COMMENT_START;
266+
return true;
144267
}
268+
269+
break;
145270
}
146271
}
272+
273+
if (valid_symbols[STRING_START])
274+
{
275+
// try to match a short single-quoted string's start (')
276+
if (consume_if(lexer, SQ_STRING_DELIMITER))
277+
{
278+
started = SHORT_SQ_STRING;
279+
}
280+
// try to match a short double-quoted string's start (")
281+
else if (consume_if(lexer, DQ_STRING_DELIMITER))
282+
{
283+
started = SHORT_DQ_STRING;
284+
}
285+
// try to match a long string's start ([=*[)
286+
else if (consume_if(lexer, '['))
287+
{
288+
unsigned int possible_depth = get_depth(lexer);
289+
290+
if (consume_if(lexer, '['))
291+
{
292+
started = LONG_STRING;
293+
depth = possible_depth;
294+
}
295+
}
296+
297+
if (started)
298+
{
299+
lexer->result_symbol = STRING_START;
300+
return true;
301+
}
302+
}
303+
304+
break;
305+
}
147306
}
148307

149308
return false;

0 commit comments

Comments
 (0)