Skip to content

Commit fce6572

Browse files
committed
Reduce resizing of token lookahead buffer
Manually track an index into the lookahead buffer to avoid buffer resizing. (Julia's built-in array actually uses the same strategy to avoid shuffling elements in popfirst!(). But an extra layer here can help as we know more about the data access.)
1 parent e650e2b commit fce6572

File tree

1 file changed

+37
-25
lines changed

1 file changed

+37
-25
lines changed

src/parse_stream.jl

Lines changed: 37 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -182,6 +182,7 @@ mutable struct ParseStream
182182
lexer::Tokenize.Lexers.Lexer{IOBuffer}
183183
# Lookahead buffer for already lexed tokens
184184
lookahead::Vector{SyntaxToken}
185+
lookahead_index::Int
185186
# Pool of stream positions for use as working space in parsing
186187
position_pool::Vector{Vector{ParseStreamPosition}}
187188
# Parser output as an ordered sequence of ranges, parent nodes after children.
@@ -207,9 +208,12 @@ mutable struct ParseStream
207208
# numbers. This means we're inexact for old dev versions but that seems
208209
# like an acceptable tradeoff.
209210
ver = (version.major, version.minor)
210-
new(text_buf, text_root, lexer,
211-
Vector{Vector{ParseStreamPosition}}(),
211+
new(text_buf,
212+
text_root,
213+
lexer,
212214
Vector{SyntaxToken}(),
215+
1,
216+
Vector{Vector{ParseStreamPosition}}(),
213217
Vector{TaggedRange}(),
214218
Vector{Diagnostic}(),
215219
next_byte,
@@ -284,19 +288,19 @@ end
284288
# Stream input interface - the peek_* family of functions
285289

286290
# Buffer several tokens ahead
287-
function _buffer_lookahead_tokens(stream::ParseStream)
291+
function _buffer_lookahead_tokens(lexer, lookahead)
288292
had_whitespace = false
289293
token_count = 0
290294
while true
291-
raw = Tokenize.Lexers.next_token(stream.lexer)
295+
raw = Tokenize.Lexers.next_token(lexer)
292296
k = TzTokens.exactkind(raw)
293297
was_whitespace = k in (K"Whitespace", K"Comment", K"NewlineWs")
294298
had_whitespace |= was_whitespace
295299
f = EMPTY_FLAGS
296300
had_whitespace && (f |= PRECEDING_WHITESPACE_FLAG)
297301
raw.dotop && (f |= DOTOP_FLAG)
298302
raw.suffix && (f |= SUFFIXED_FLAG)
299-
push!(stream.lookahead, SyntaxToken(SyntaxHead(k, f), raw.startbyte + 1, raw.endbyte + 1))
303+
push!(lookahead, SyntaxToken(SyntaxHead(k, f), raw.startbyte + 1, raw.endbyte + 1))
300304
token_count += 1
301305
if k == K"EndMarker"
302306
break
@@ -320,7 +324,7 @@ end
320324
# zero whitespace tokens before the next token. The following code is an
321325
# unrolled optimized version for that fast path. Empirically it seems we
322326
# only hit the slow path about 5% of the time here.
323-
i = 1
327+
i = stream.lookahead_index
324328
if n == 1 && i+1 <= length(stream.lookahead)
325329
if skip_newlines
326330
k = kind(stream.lookahead[i])
@@ -349,15 +353,20 @@ end
349353
end
350354

351355
@noinline function __lookahead_index(stream, n, skip_newlines)
352-
i = 1
356+
i = stream.lookahead_index
353357
while true
354358
if i > length(stream.lookahead)
355-
_buffer_lookahead_tokens(stream)
359+
n_to_delete = stream.lookahead_index-1
360+
if n_to_delete > 0.9*length(stream.lookahead)
361+
Base._deletebeg!(stream.lookahead, n_to_delete)
362+
i -= n_to_delete
363+
stream.lookahead_index = 1
364+
end
365+
_buffer_lookahead_tokens(stream.lexer, stream.lookahead)
356366
end
357367
k = kind(stream.lookahead[i])
358-
is_skipped = k in (K"Whitespace", K"Comment") ||
359-
(k == K"NewlineWs" && skip_newlines)
360-
if !is_skipped
368+
if !((k == K"Whitespace" || k == K"Comment") ||
369+
(k == K"NewlineWs" && skip_newlines))
361370
if n == 1
362371
return i
363372
end
@@ -398,7 +407,7 @@ function peek_token(stream::ParseStream, n::Integer=1;
398407
end
399408
i = _lookahead_index(stream, n, skip_newlines)
400409
if !skip_whitespace
401-
i = 1
410+
i = stream.lookahead_index
402411
end
403412
return stream.lookahead[i]
404413
end
@@ -445,13 +454,13 @@ end
445454
#
446455
# Though note bump() really does both input and output
447456

448-
# Bump the next `n` tokens
457+
# Bump up until the `n`th token
449458
# flags and remap_kind are applied to any non-trivia tokens
450-
function _bump_n(stream::ParseStream, n::Integer, flags, remap_kind=K"None")
451-
if n <= 0
459+
function _bump_until_n(stream::ParseStream, n::Integer, flags, remap_kind=K"None")
460+
if n < stream.lookahead_index
452461
return
453462
end
454-
for i = 1:n
463+
for i in stream.lookahead_index:n
455464
tok = stream.lookahead[i]
456465
k = kind(tok)
457466
if k == K"EndMarker"
@@ -465,7 +474,7 @@ function _bump_n(stream::ParseStream, n::Integer, flags, remap_kind=K"None")
465474
last_byte(tok), lastindex(stream.ranges)+1)
466475
push!(stream.ranges, range)
467476
end
468-
Base._deletebeg!(stream.lookahead, n)
477+
stream.lookahead_index = n + 1
469478
stream.next_byte = last_byte(last(stream.ranges)) + 1
470479
# Defuse the time bomb
471480
stream.peek_count = 0
@@ -480,7 +489,7 @@ Shift the current token from the input to the output, adding the given flags.
480489
function bump(stream::ParseStream, flags=EMPTY_FLAGS; skip_newlines=false,
481490
error=nothing, remap_kind::Kind=K"None")
482491
emark = position(stream)
483-
_bump_n(stream, _lookahead_index(stream, 1, skip_newlines), flags, remap_kind)
492+
_bump_until_n(stream, _lookahead_index(stream, 1, skip_newlines), flags, remap_kind)
484493
if !isnothing(error)
485494
emit(stream, emark, K"error", flags, error=error)
486495
end
@@ -496,7 +505,7 @@ Bump comments and whitespace tokens preceding the next token
496505
function bump_trivia(stream::ParseStream, flags=EMPTY_FLAGS;
497506
skip_newlines=true, error=nothing)
498507
emark = position(stream)
499-
_bump_n(stream, _lookahead_index(stream, 1, skip_newlines) - 1, EMPTY_FLAGS)
508+
_bump_until_n(stream, _lookahead_index(stream, 1, skip_newlines) - 1, EMPTY_FLAGS)
500509
if !isnothing(error)
501510
emit(stream, emark, K"error", flags, error=error)
502511
end
@@ -523,11 +532,12 @@ lexing ambiguities. There's no special whitespace handling — bump any
523532
whitespace if necessary with bump_trivia.
524533
"""
525534
function bump_glue(stream::ParseStream, kind, flags, num_tokens)
535+
i = stream.lookahead_index
526536
span = TaggedRange(SyntaxHead(kind, flags), K"None",
527-
first_byte(stream.lookahead[1]),
528-
last_byte(stream.lookahead[num_tokens]),
537+
first_byte(stream.lookahead[i]),
538+
last_byte(stream.lookahead[i-1+num_tokens]),
529539
lastindex(stream.ranges) + 1)
530-
Base._deletebeg!(stream.lookahead, num_tokens)
540+
stream.lookahead_index += num_tokens
531541
push!(stream.ranges, span)
532542
stream.next_byte = last_byte(last(stream.ranges)) + 1
533543
stream.peek_count = 0
@@ -553,7 +563,8 @@ TODO: Are these the only cases? Can we replace this general utility with a
553563
simpler one which only splits preceding dots?
554564
"""
555565
function bump_split(stream::ParseStream, split_spec...)
556-
tok = popfirst!(stream.lookahead)
566+
tok = stream.lookahead[stream.lookahead_index]
567+
stream.lookahead_index += 1
557568
fbyte = first_byte(tok)
558569
for (i, (nbyte, k, f)) in enumerate(split_spec)
559570
lbyte = (i == length(split_spec)) ? last_byte(tok) : fbyte + nbyte - 1
@@ -655,8 +666,9 @@ function emit_diagnostic(stream::ParseStream; whitespace=false, kws...)
655666
if whitespace
656667
# It's the whitespace which is the error. Find the range of the current
657668
# whitespace.
658-
begin_tok_i = 1
659-
end_tok_i = is_whitespace(stream.lookahead[i]) ? i : max(1, i-1)
669+
begin_tok_i = stream.lookahead_index
670+
end_tok_i = is_whitespace(stream.lookahead[i]) ?
671+
i : max(stream.lookahead_index, i-1)
660672
end
661673
fbyte = first_byte(stream.lookahead[begin_tok_i])
662674
lbyte = last_byte(stream.lookahead[end_tok_i])

0 commit comments

Comments
 (0)