11#include <tree_sitter/parser.h>
2+ #include <wctype.h>
23
// Kinds of tokens this external scanner can report. The values are used as
// indices into `valid_symbols` and are assigned to `lexer->result_symbol`.
// NOTE(review): presumably the order must mirror the `externals` list of the
// grammar -- confirm against the grammar definition before reordering.
enum TokenType
{
    COMMENT_START = 0,
    COMMENT_CONTENT = 1,
    COMMENT_END = 2,
    STRING_START = 3,
    STRING_CONTENT = 4,
    STRING_END = 5,
};
1013
11- // helpers
12- const char EOF = 0 ;
13-
1414static void consume (TSLexer * lexer )
1515{
1616 lexer -> advance (lexer , false);
@@ -19,8 +19,20 @@ static void skip(TSLexer *lexer)
1919{
2020 lexer -> advance (lexer , true);
2121}
22+ static bool consume_if (TSLexer * lexer , const int32_t character )
23+ {
24+ if (lexer -> lookahead == character )
25+ {
26+ consume (lexer );
27+ return true;
28+ }
29+
30+ return false;
31+ }
32+
// Delimiters of short (single-line) strings. Declared `static` so these
// generically named constants keep internal linkage and cannot clash with
// symbols of other objects linked into the same binary.
static const char SQ_STRING_DELIMITER = '\'';
static const char DQ_STRING_DELIMITER = '"';
2235
23- // scanner
2436void * tree_sitter_lua_external_scanner_create ()
2537{
2638 return NULL ;
@@ -30,120 +42,267 @@ void tree_sitter_lua_external_scanner_destroy(void *payload)
3042{
3143}
3244
33- unsigned tree_sitter_lua_external_scanner_serialize (
34- void * payload ,
35- char * buffer )
// Identifies which comment or string is currently open. The scanner stores
// one of these values in `started`; 0 (no enumerator) means nothing is open.
enum StartedToken
{
    SHORT_COMMENT = 1,
    SHORT_SQ_STRING = 2,
    SHORT_DQ_STRING = 3,
    LONG_COMMENT = 4,
    LONG_STRING = 5,
};
53+
// Scanner state shared by the scan/serialize/deserialize functions.
// `started` holds a StartedToken value (0 when no comment/string is open);
// `depth` is the number of '=' characters of the open long comment/string.
// Both are `static`: mutable globals with such generic names must not have
// external linkage, or they may collide with other objects at link time.
static unsigned short started = 0;
static unsigned int depth = 0;
56+
57+ unsigned int tree_sitter_lua_external_scanner_serialize (void * payload , char * buffer )
58+ {
59+ buffer [0 ] = started ;
60+ buffer [1 ] = depth ;
61+ return 2 ;
3862}
3963
40- void tree_sitter_lua_external_scanner_deserialize (
41- void * payload ,
42- const char * buffer ,
43- unsigned length )
64+ void tree_sitter_lua_external_scanner_deserialize (void * payload , const char * buffer , unsigned int length )
4465{
66+ if (length == 2 )
67+ {
68+ started = buffer [0 ];
69+ depth = buffer [1 ];
70+ }
71+ }
72+
73+ static unsigned int get_depth (TSLexer * lexer )
74+ {
75+ unsigned int current_depth = 0 ;
76+ while (consume_if (lexer , '=' ))
77+ {
78+ current_depth += 1 ;
79+ }
80+
81+ return current_depth ;
4582}
4683
47- static bool is_not_new_line_or_eof (TSLexer * lexer )
84+ static bool scan_depth (TSLexer * lexer )
4885{
49- return lexer -> lookahead != '\n' && lexer -> lookahead != EOF ;
86+ unsigned int remaining_depth = depth ;
87+ while (remaining_depth > 0 && consume_if (lexer , '=' ))
88+ {
89+ remaining_depth -= 1 ;
90+ }
91+
92+ return remaining_depth == 0 ;
5093}
5194
52- bool tree_sitter_lua_external_scanner_scan (
53- void * payload ,
54- TSLexer * lexer ,
55- const bool * valid_symbols )
95+ bool tree_sitter_lua_external_scanner_scan (void * payload , TSLexer * lexer , const bool * valid_symbols )
5696{
57- if (valid_symbols [SHORT_SQ_STRING_CONTENT ] ||
58- valid_symbols [SHORT_DQ_STRING_CONTENT ])
97+ switch (started )
98+ {
99+ case SHORT_COMMENT :
59100 {
60- const bool is_single_quote = valid_symbols [SHORT_SQ_STRING_CONTENT ];
61- const char end_quote = is_single_quote ? '\'' : '"' ;
101+ // try to match the short comment's end (new line or eof)
102+ if (lexer -> lookahead == '\n' || lexer -> eof (lexer ))
103+ {
104+ if (valid_symbols [COMMENT_END ])
105+ {
106+ started = 0 ;
62107
63- // try to consume almost anything
64- if (lexer -> lookahead != end_quote && is_not_new_line_or_eof (lexer ))
108+ lexer -> result_symbol = COMMENT_END ;
109+ return true;
110+ }
111+ }
112+ else if (valid_symbols [COMMENT_CONTENT ])
65113 {
114+ // consume all characters till a short comment's end
66115 do
67116 {
68- // try to consume a backslash
69- if (lexer -> lookahead == '\\' )
70- {
71- consume (lexer );
117+ consume (lexer );
118+ } while (lexer -> lookahead != '\n' && !lexer -> eof (lexer ));
72119
73- // try to consume almost anything
74- if (is_not_new_line_or_eof (lexer ))
75- {
76- consume (lexer );
77- }
78- }
79- else
120+ lexer -> result_symbol = COMMENT_CONTENT ;
121+ return true;
122+ }
123+
124+ break ;
125+ }
126+ case SHORT_SQ_STRING :
127+ case SHORT_DQ_STRING :
128+ {
129+ // define the short string's delimiter
130+ const char delimiter = started == SHORT_SQ_STRING ? SQ_STRING_DELIMITER : DQ_STRING_DELIMITER ;
131+
132+ // try to match the short string's end (" or ')
133+ if (consume_if (lexer , delimiter ))
134+ {
135+ if (valid_symbols [STRING_END ])
136+ {
137+ started = 0 ;
138+
139+ lexer -> result_symbol = STRING_END ;
140+ return true;
141+ }
142+ }
143+ else if (valid_symbols [STRING_CONTENT ] && lexer -> lookahead != '\n' && !lexer -> eof (lexer ))
144+ {
145+ // consume any character till a short string's end, new line or eof
146+ do
147+ {
148+ // consume any character after a backslash, unless it's a new line or eof
149+ if (consume_if (lexer , '\\' ) && (lexer -> lookahead == '\n' || lexer -> eof (lexer )))
80150 {
81- consume ( lexer ) ;
151+ break ;
82152 }
83- // consume almost everything
84- } while (lexer -> lookahead != end_quote && is_not_new_line_or_eof (lexer ));
85153
86- lexer -> result_symbol = is_single_quote
87- ? SHORT_SQ_STRING_CONTENT
88- : SHORT_DQ_STRING_CONTENT ;
154+ consume (lexer );
155+ } while (lexer -> lookahead != delimiter && lexer -> lookahead != '\n' && !lexer -> eof (lexer ));
156+
157+ lexer -> result_symbol = STRING_CONTENT ;
89158 return true;
90159 }
160+
161+ break ;
91162 }
92- else if (valid_symbols [LONG_STRING ] || valid_symbols [LONG_COMMENT ])
163+ case LONG_COMMENT :
164+ case LONG_STRING :
93165 {
94- // try to consume a first opening bracket
95- if (lexer -> lookahead == '[' )
96- {
97- consume (lexer );
166+ const bool is_inside_a_comment = started == LONG_COMMENT ;
98167
99- // consume any level delimiters, and store how many there are
100- unsigned int level_count = 0 ;
101- while (lexer -> lookahead == '=' )
168+ bool some_characters_were_consumed = false;
169+ if (is_inside_a_comment ? valid_symbols [COMMENT_END ] : valid_symbols [STRING_END ])
170+ {
171+ // try to match the long comment's/string's end (]=*])
172+ if (consume_if (lexer , ']' ))
102173 {
103- consume (lexer );
104- level_count += 1 ;
174+ if (scan_depth (lexer ) && consume_if (lexer , ']' ))
175+ {
176+ started = 0 ;
177+ depth = 0 ;
178+
179+ lexer -> result_symbol = is_inside_a_comment ? COMMENT_END : STRING_END ;
180+ return true;
181+ }
182+
183+ some_characters_were_consumed = true;
105184 }
185+ }
106186
107- // try to consume the last opening bracket
108- if (lexer -> lookahead == '[' )
187+ if (is_inside_a_comment ? valid_symbols [COMMENT_CONTENT ] : valid_symbols [STRING_CONTENT ])
188+ {
189+ if (!some_characters_were_consumed )
109190 {
191+ if (lexer -> eof (lexer ))
192+ {
193+ break ;
194+ }
195+
196+ // consume the next character as it can't start a long comment's/string's end ([)
110197 consume (lexer );
198+ }
111199
112- // consume almost everything
113- while (lexer -> lookahead != EOF )
200+ // consume any character till a long comment's/string's end or eof
201+ while (true)
202+ {
203+ lexer -> mark_end (lexer );
204+ if (consume_if (lexer , ']' ))
114205 {
115- // try to consume a first closing bracket
116- if (lexer -> lookahead == ']' )
206+ if (scan_depth (lexer ))
117207 {
118- consume (lexer );
119-
120- // try to consume every level delimiters
121- unsigned int current_level ;
122- for (current_level = level_count ;
123- current_level > 0 && lexer -> lookahead == '=' ;
124- current_level -= 1 )
208+ if (consume_if (lexer , ']' ))
125209 {
126- consume ( lexer ) ;
210+ break ;
127211 }
212+ }
213+ else
214+ {
215+ continue ;
216+ }
217+ }
218+
219+ if (lexer -> eof (lexer ))
220+ {
221+ break ;
222+ }
223+
224+ consume (lexer );
225+ }
226+
227+ lexer -> result_symbol = is_inside_a_comment ? COMMENT_CONTENT : STRING_CONTENT ;
228+ return true;
229+ }
230+
231+ break ;
232+ }
233+ default :
234+ {
235+ // ignore all whitespace
236+ while (iswspace (lexer -> lookahead ))
237+ {
238+ skip (lexer );
239+ }
240+
241+ if (valid_symbols [COMMENT_START ])
242+ {
243+ // try to match a short comment's start (--)
244+ if (consume_if (lexer , '-' ))
245+ {
246+ if (consume_if (lexer , '-' ))
247+ {
248+ started = SHORT_COMMENT ;
249+
250+ // try to match a long comment's start (--[=*[)
251+ lexer -> mark_end (lexer );
252+ if (consume_if (lexer , '[' ))
253+ {
254+ unsigned int possible_depth = get_depth (lexer );
128255
129- // try to consume the last closing bracket if all levels are consumed
130- if (current_level == 0 && lexer -> lookahead == ']' )
256+ if (consume_if (lexer , '[' ))
131257 {
132- consume (lexer );
258+ started = LONG_COMMENT ;
259+ depth = possible_depth ;
133260
134- lexer -> result_symbol = valid_symbols [LONG_STRING ]
135- ? LONG_STRING
136- : LONG_COMMENT ;
137- return true;
261+ lexer -> mark_end (lexer );
138262 }
139263 }
140- else
141- {
142- consume (lexer );
143- }
264+
265+ lexer -> result_symbol = COMMENT_START ;
266+ return true;
144267 }
268+
269+ break ;
145270 }
146271 }
272+
273+ if (valid_symbols [STRING_START ])
274+ {
275+ // try to match a short single-quoted string's start (')
276+ if (consume_if (lexer , SQ_STRING_DELIMITER ))
277+ {
278+ started = SHORT_SQ_STRING ;
279+ }
280+ // try to match a short double-quoted string's start (")
281+ else if (consume_if (lexer , DQ_STRING_DELIMITER ))
282+ {
283+ started = SHORT_DQ_STRING ;
284+ }
285+ // try to match a long string's start ([=*[)
286+ else if (consume_if (lexer , '[' ))
287+ {
288+ unsigned int possible_depth = get_depth (lexer );
289+
290+ if (consume_if (lexer , '[' ))
291+ {
292+ started = LONG_STRING ;
293+ depth = possible_depth ;
294+ }
295+ }
296+
297+ if (started )
298+ {
299+ lexer -> result_symbol = STRING_START ;
300+ return true;
301+ }
302+ }
303+
304+ break ;
305+ }
147306 }
148307
149308 return false;
0 commit comments