@@ -108,15 +108,7 @@ Information about preceding whitespace is added for use by the parser.
struct SyntaxToken
    head::SyntaxHead
    first_byte::UInt32
-    last_byte::UInt32
-end
-
-function SyntaxToken(raw::Token, had_whitespace)
-    f = EMPTY_FLAGS
-    had_whitespace && (f |= PRECEDING_WHITESPACE_FLAG)
-    raw.dotop && (f |= DOTOP_FLAG)
-    raw.suffix && (f |= SUFFIXED_FLAG)
-    SyntaxToken(SyntaxHead(raw.kind, f), raw.startbyte + 1, raw.endbyte + 1)
+    last_byte::UInt32  # TODO: Remove this?
end

function Base.show(io::IO, tok::SyntaxToken)
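# For orientation, a minimal standalone sketch of how the whitespace / dotted /
# suffixed information handled by the removed constructor can be packed into a
# bit-flag field. The *_SKETCH names and flag values below are illustrative
# assumptions, not definitions from this package.
const EMPTY_FLAGS_SKETCH = 0x0000
const PRECEDING_WHITESPACE_FLAG_SKETCH = 0x0001
const DOTOP_FLAG_SKETCH = 0x0002
const SUFFIXED_FLAG_SKETCH = 0x0004

function token_flags_sketch(had_whitespace::Bool, dotop::Bool, suffix::Bool)
    f = EMPTY_FLAGS_SKETCH
    had_whitespace && (f |= PRECEDING_WHITESPACE_FLAG_SKETCH)
    dotop && (f |= DOTOP_FLAG_SKETCH)
    suffix && (f |= SUFFIXED_FLAG_SKETCH)
    return f
end

token_flags_sketch(true, true, false)  # == 0x0003: whitespace + dotted operator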
@@ -137,15 +129,19 @@ is_decorated(tok::SyntaxToken) = is_dotted(tok) || is_suffixed(tok)
Range in the source text which will become a node in the tree. Can be either a
token (leaf node of the tree) or an interior node, depending on how the
start_mark compares to previous nodes.
-
-TODO: Optimize this data structure? It's very large at the moment.
"""
struct TaggedRange
    head::SyntaxHead    # Kind,flags
    orig_kind::Kind     # Kind of the original token for leaf tokens, or K"None"
    first_byte::UInt32  # First byte in the input text
    last_byte::UInt32   # Last byte in the input text
    start_mark::UInt32  # Index of first emitted range which this range covers
+    # TODO: Remove the three fields above & replace with:
+    # is_leaf::Bool
+    # # The following field is used for one of two things:
+    # # - For leaf nodes it points to the last byte of the token in the input text
+    # # - For non-leaf nodes it points to the index of the first child
+    # last_byte_or_first_child::UInt32
end

head(range::TaggedRange) = range.head
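# A standalone sketch of the more compact layout suggested by the TODO comments
# in TaggedRange above: an explicit is_leaf flag plus one UInt32 whose meaning
# depends on leafness. The struct and accessor names here are hypothetical and
# only illustrate the idea, not this package's API.
struct CompactRangeSketch
    first_byte::UInt32
    is_leaf::Bool
    last_byte_or_first_child::UInt32
end

# For leaf nodes the shared field holds the last byte of the token in the text.
function last_byte_sketch(r::CompactRangeSketch)
    @assert r.is_leaf
    return r.last_byte_or_first_child
end

# For interior nodes it holds the index of the node's first child instead.
function first_child_sketch(r::CompactRangeSketch)
    @assert !r.is_leaf
    return r.last_byte_or_first_child
end

leaf = CompactRangeSketch(UInt32(1), true, UInt32(3))
last_byte_sketch(leaf)  # == 0x00000003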
@@ -287,28 +283,72 @@ end
#-------------------------------------------------------------------------------
# Stream input interface - the peek_* family of functions

-# Buffer up until the next non-whitespace token.
-# This can buffer more than strictly necessary when newlines are significant,
-# but this is not a big problem.
+# Buffer several tokens ahead
function _buffer_lookahead_tokens(stream::ParseStream)
    had_whitespace = false
+    token_count = 0
    while true
        raw = Tokenize.Lexers.next_token(stream.lexer)
        k = TzTokens.exactkind(raw)
        was_whitespace = k in (K"Whitespace", K"Comment", K"NewlineWs")
        had_whitespace |= was_whitespace
-        push!(stream.lookahead, SyntaxToken(raw, had_whitespace))
-        if !was_whitespace
+        f = EMPTY_FLAGS
+        had_whitespace && (f |= PRECEDING_WHITESPACE_FLAG)
+        raw.dotop && (f |= DOTOP_FLAG)
+        raw.suffix && (f |= SUFFIXED_FLAG)
+        push!(stream.lookahead, SyntaxToken(SyntaxHead(k, f), raw.startbyte + 1, raw.endbyte + 1))
+        token_count += 1
+        if k == K"EndMarker"
            break
        end
+        if !was_whitespace
+            # Buffer tokens in batches for lookahead. Generally we want a
+            # moderate-size buffer to make sure we hit the fast path of peek(),
+            # but not too large to avoid (a) polluting the processor cache and
+            # (b) doing unnecessary work when not parsing the whole input.
+            had_whitespace = false
+            if token_count > 100
+                break
+            end
+        end
    end
end
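# A standalone toy model of the batched buffering above, assuming a simplified
# token source: tokens are pulled into a lookahead vector in chunks of up to
# 100, and peeking only refills the buffer when it runs dry. The type and
# function names here are illustrative, not the real ParseStream internals.
mutable struct LookaheadSketch
    tokens::Vector{Symbol}      # stands in for the lexer output
    next::Int                   # next token to pull from `tokens`
    lookahead::Vector{Symbol}   # buffered tokens
    pos::Int                    # current position in `lookahead`
end
LookaheadSketch(toks) = LookaheadSketch(collect(Symbol, toks), 1, Symbol[], 1)

function buffer_lookahead_sketch!(s::LookaheadSketch; batch::Int=100)
    n = 0
    while s.next <= length(s.tokens) && n < batch
        push!(s.lookahead, s.tokens[s.next])
        s.next += 1
        n += 1
    end
end

function peek_sketch(s::LookaheadSketch)
    if s.pos > length(s.lookahead)      # slow path: refill a batch of tokens
        buffer_lookahead_sketch!(s)
    end
    s.pos > length(s.lookahead) ? :EndMarker : s.lookahead[s.pos]
end

s = LookaheadSketch([:x, :+, :y])
peek_sketch(s)  # == :x; the whole (short) input was buffered in one batch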

-# Find the index of the first nontrivia token in the lookahead buffer.
-#
-# TODO: Store this as part of _buffer_lookahead_tokens to avoid redoing this
-# work all the time!
-function _lookahead_index(stream::ParseStream, n::Integer, skip_newlines::Bool)
+# Find the index of the next nontrivia token
+@inline function _lookahead_index(stream::ParseStream, n::Integer, skip_newlines::Bool)
+    # Much of the time we'll be peeking ahead a single token and have one or
+    # zero whitespace tokens before the next token. The following code is an
+    # unrolled optimized version for that fast path. Empirically it seems we
+    # only hit the slow path about 5% of the time here.
+    i = 1
+    if n == 1 && i+1 <= length(stream.lookahead)
+        if skip_newlines
+            k = kind(stream.lookahead[i])
+            if !(k == K"Whitespace" || k == K"Comment" || k == K"NewlineWs")
+                return i
+            end
+            i += 1
+            k = kind(stream.lookahead[i])
+            if !(k == K"Whitespace" || k == K"Comment" || k == K"NewlineWs")
+                return i
+            end
+        else
+            k = kind(stream.lookahead[i])
+            if !(k == K"Whitespace" || k == K"Comment")
+                return i
+            end
+            i += 1
+            k = kind(stream.lookahead[i])
+            if !(k == K"Whitespace" || k == K"Comment")
+                return i
+            end
+        end
+    end
+    # Fall through to the general case
+    return __lookahead_index(stream, n, skip_newlines)
+end
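# The _lookahead_index fast path above, together with the @noinline fallback
# defined below, follows a common Julia pattern: a small @inline method covers
# the statistically common case with straight-line code and falls back to a
# @noinline general method, keeping the fast path cheap to inline at call
# sites. A standalone sketch of the pattern with illustrative names:
@inline function first_positive_sketch(v::Vector{Int})
    # Fast path: the answer is usually one of the first two elements.
    if length(v) >= 1 && v[1] > 0
        return 1
    elseif length(v) >= 2 && v[2] > 0
        return 2
    end
    return _first_positive_slow_sketch(v)
end

@noinline function _first_positive_slow_sketch(v::Vector{Int})
    for i in eachindex(v)
        v[i] > 0 && return i
    end
    return 0
end

first_positive_sketch([-1, -2, 5])  # == 3, via the @noinline slow path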
+
+@noinline function __lookahead_index(stream, n, skip_newlines)
    i = 1
    while true
        if i > length(stream.lookahead)