Skip to content

Commit e650e2b

Browse files
committed
Optimize token buffering and peek()
This change implements a fast-path for token lookahead in peek() and increases the size of the lookahead buffer to make this more efficient.
1 parent cdedc77 commit e650e2b

File tree

1 file changed

+61
-21
lines changed

1 file changed

+61
-21
lines changed

src/parse_stream.jl

Lines changed: 61 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -108,15 +108,7 @@ Information about preceding whitespace is added for use by the parser.
108108
"""
A single lexed token as stored in the parse stream's lookahead buffer.

Carries the token's `SyntaxHead` (kind plus flags such as preceding
whitespace / dotted-operator / suffixed — see `_buffer_lookahead_tokens`)
together with its 1-based byte range in the input text.
"""
struct SyntaxToken
    head::SyntaxHead
    first_byte::UInt32
    last_byte::UInt32  # TODO: Remove this?
end
121113

122114
function Base.show(io::IO, tok::SyntaxToken)
@@ -137,15 +129,19 @@ is_decorated(tok::SyntaxToken) = is_dotted(tok) || is_suffixed(tok)
137129
Range in the source text which will become a node in the tree. Can be either a
138130
token (leaf node of the tree) or an interior node, depending on how the
139131
start_mark compares to previous nodes.
140-
141-
TODO: Optimize this data structure? It's very large at the moment.
142132
"""
143133
struct TaggedRange
    head::SyntaxHead       # Kind and flags for this range
    orig_kind::Kind        # Kind of the original token for leaf tokens, or K"None"
    first_byte::UInt32     # First byte in the input text
    last_byte::UInt32      # Last byte in the input text
    start_mark::UInt32     # Index of first emitted range which this range covers
    # TODO: Remove the three fields above & replace with:
    #   is_leaf::Bool
    #   # The following field is used for one of two things:
    #   #  - For leaf nodes: the last byte of the token in the input text
    #   #  - For non-leaf nodes: the index of the first child
    #   last_byte_or_first_child::UInt32
end
150146

151147
head(range::TaggedRange) = range.head
@@ -287,28 +283,72 @@ end
287283
#-------------------------------------------------------------------------------
288284
# Stream input interface - the peek_* family of functions
289285

290-
# Buffer up until the next non-whitespace token.
291-
# This can buffer more than strictly necessary when newlines are significant,
292-
# but this is not a big problem.
286+
# Refill the stream's lookahead buffer with a batch of upcoming tokens.
#
# Trivia (whitespace/comment/newline) tokens are buffered too; each
# non-trivia token records whether any trivia immediately preceded it via
# PRECEDING_WHITESPACE_FLAG.
function _buffer_lookahead_tokens(stream::ParseStream)
    pending_ws = false   # saw trivia since the last non-trivia token?
    buffered = 0
    while true
        tok = Tokenize.Lexers.next_token(stream.lexer)
        kd = TzTokens.exactkind(tok)
        is_trivia = kd in (K"Whitespace", K"Comment", K"NewlineWs")
        pending_ws |= is_trivia
        flags = EMPTY_FLAGS
        pending_ws && (flags |= PRECEDING_WHITESPACE_FLAG)
        tok.dotop && (flags |= DOTOP_FLAG)
        tok.suffix && (flags |= SUFFIXED_FLAG)
        # Token byte ranges from the lexer are zero-based; convert to 1-based.
        push!(stream.lookahead,
              SyntaxToken(SyntaxHead(kd, flags), tok.startbyte + 1, tok.endbyte + 1))
        buffered += 1
        if kd == K"EndMarker"
            break
        end
        if !is_trivia
            # Buffer tokens in batches for lookahead. Generally we want a
            # moderate-size buffer to make sure we hit the fast path of peek(),
            # but not too large to avoid (a) polluting the processor cache and
            # (b) doing unnecessary work when not parsing the whole input.
            pending_ws = false
            if buffered > 100
                break
            end
        end
    end
end
306316

307-
# Find the index of the next nontrivia token
@inline function _lookahead_index(stream::ParseStream, n::Integer, skip_newlines::Bool)
    # Much of the time we'll be peeking ahead a single token and have one or
    # zero whitespace tokens before the next token. The following code is an
    # optimized version of that fast path, inspecting at most the first two
    # buffered tokens before handing off. Empirically it seems we only hit
    # the slow path about 5% of the time here.
    i = 1
    if n == 1 && i + 1 <= length(stream.lookahead)
        # Check up to two leading tokens; NewlineWs only counts as trivia
        # when skip_newlines is set.
        for _ in 1:2
            k = kind(stream.lookahead[i])
            if !(k == K"Whitespace" || k == K"Comment" ||
                 (skip_newlines && k == K"NewlineWs"))
                return i
            end
            i += 1
        end
    end
    # Fall through to the general case
    return __lookahead_index(stream, n, skip_newlines)
end
350+
351+
@noinline function __lookahead_index(stream, n, skip_newlines)
312352
i = 1
313353
while true
314354
if i > length(stream.lookahead)

0 commit comments

Comments
 (0)