Skip to content

Commit 98bd80c

Browse files
authored
Cleanup: put next_byte in token rather than first_byte (#21)
While a bit counter-intuitive, this enables us to use an initial sentinel token for recording the first byte of the first real token, which removes a bunch of special-case hacks for computing the last byte of the current output token. Also return only the `SyntaxHead` from `peek_token()` - the byte range is never needed.
1 parent 2b4f52e commit 98bd80c

File tree

2 files changed

+69
-60
lines changed

2 files changed

+69
-60
lines changed

src/parse_stream.jl

Lines changed: 68 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -105,24 +105,27 @@ numeric_flags(x) = numeric_flags(flags(x))
105105
#-------------------------------------------------------------------------------
106106
"""
107107
`SyntaxToken` is a token covering a contiguous byte range in the input text.
108-
Information about preceding whitespace is added for use by the parser.
108+
109+
We record only the `next_byte` here (the index of the next byte *after* the
110+
token) to avoid duplication of data between neighbouring tokens. This is more
111+
useful than recording the first byte, as it allows an initial fixed sentinel
112+
token to be used for recording the first byte of the first real token.
109113
"""
110114
struct SyntaxToken
111115
head::SyntaxHead
112116
orig_kind::Kind
113-
first_byte::UInt32
117+
next_byte::UInt32
114118
end
115119

116-
function SyntaxToken(head::SyntaxHead, first_byte::Integer)
117-
SyntaxToken(head, kind(head), first_byte)
120+
function SyntaxToken(head::SyntaxHead, next_byte::Integer)
121+
SyntaxToken(head, kind(head), next_byte)
118122
end
119123

120124
function Base.show(io::IO, tok::SyntaxToken)
121-
print(io, rpad(untokenize(tok.head, unique=false), 15), " @", first_byte(tok))
125+
print(io, rpad(untokenize(tok.head, unique=false), 15), " |", tok.next_byte)
122126
end
123127

124128
head(tok::SyntaxToken) = tok.head
125-
first_byte(tok::SyntaxToken) = tok.first_byte
126129

127130

128131
#-------------------------------------------------------------------------------
@@ -200,13 +203,16 @@ mutable struct ParseStream
200203
# numbers. This means we're inexact for old dev versions but that seems
201204
# like an acceptable tradeoff.
202205
ver = (version.major, version.minor)
206+
# Initial sentinel token containing the first byte of the first real token.
207+
sentinel = SyntaxToken(SyntaxHead(K"TOMBSTONE", EMPTY_FLAGS),
208+
K"TOMBSTONE", next_byte)
203209
new(text_buf,
204210
text_root,
205211
lexer,
206212
Vector{SyntaxToken}(),
207213
1,
208214
Vector{Vector{ParseStreamPosition}}(),
209-
Vector{SyntaxToken}(),
215+
SyntaxToken[sentinel],
210216
Vector{TaggedRange}(),
211217
Vector{Diagnostic}(),
212218
0,
@@ -282,12 +288,25 @@ function token_is_last(stream, pos)
282288
pos.token_index > stream.ranges[pos.range_index].last_token
283289
end
284290

285-
# Safely compute the first byte of a token, including the token off the end of
286-
# the stream.
291+
# Compute the first byte of the token at index `i` (this is the `next_byte` of the preceding token)
287292
function token_first_byte(stream, i)
288-
i == length(stream.tokens) + 1 ?
289-
_next_byte(stream) :
290-
stream.tokens[i].first_byte
293+
stream.tokens[i-1].next_byte
294+
end
295+
296+
# Return the index of the final byte covered by the output token at index `i`.
# Tokens store only the index of the byte *after* them, so step back by one.
function token_last_byte(stream::ParseStream, i)
    tok = stream.tokens[i]
    return tok.next_byte - 1
end
299+
300+
# Return the number of bytes covered by the output token at index `i`,
# computed as the distance between its own `next_byte` and that of the
# token immediately before it.
function token_span(stream::ParseStream, i)
    span_end = stream.tokens[i].next_byte
    span_start = stream.tokens[i-1].next_byte
    return span_end - span_start
end
303+
304+
# Return the first byte of the lookahead token at index `i`.
# The first lookahead entry has no predecessor in the lookahead buffer, so its
# start is taken from `_next_byte(stream)`; every other entry starts where the
# previous lookahead token ends.
function lookahead_token_first_byte(stream, i)
    if i == 1
        return _next_byte(stream)
    end
    return stream.lookahead[i-1].next_byte
end
307+
308+
# Return the index of the final byte covered by the lookahead token at
# index `i` (one less than the token's recorded `next_byte`).
function lookahead_token_last_byte(stream, i)
    tok = stream.lookahead[i]
    return tok.next_byte - 1
end
292311

293312
#-------------------------------------------------------------------------------
@@ -297,7 +316,6 @@ end
297316
function _buffer_lookahead_tokens(lexer, lookahead)
298317
had_whitespace = false
299318
token_count = 0
300-
done = false
301319
while true
302320
raw = Tokenize.Lexers.next_token(lexer)
303321
k = TzTokens.exactkind(raw)
@@ -307,7 +325,7 @@ function _buffer_lookahead_tokens(lexer, lookahead)
307325
had_whitespace && (f |= PRECEDING_WHITESPACE_FLAG)
308326
raw.dotop && (f |= DOTOP_FLAG)
309327
raw.suffix && (f |= SUFFIXED_FLAG)
310-
push!(lookahead, SyntaxToken(SyntaxHead(k, f), raw.startbyte + 1))
328+
push!(lookahead, SyntaxToken(SyntaxHead(k, f), raw.endbyte + 2))
311329
token_count += 1
312330
if k == K"EndMarker"
313331
break
@@ -318,31 +336,19 @@ function _buffer_lookahead_tokens(lexer, lookahead)
318336
# but not too large to avoid (a) polluting the processor cache and
319337
# (b) doing unnecessary work when not parsing the whole input.
320338
had_whitespace = false
321-
if done
322-
break
323-
end
324339
if token_count > 100
325-
# Buffer at least one token after the last so we can get the
326-
# current token's last byte based on the next token. (May need
327-
# more than one to correctly apply had_whitespace state.)
328-
done = true
340+
break
329341
end
330342
end
331343
end
332344
end
333345

334346
# Return the index of the next byte of the input
335347
function _next_byte(stream)
336-
if stream.lookahead_index > length(stream.lookahead)
337-
__lookahead_index(stream, 1, false) # Will buffer more tokens
338-
end
339-
stream.lookahead[stream.lookahead_index].first_byte
348+
last(stream.tokens).next_byte
340349
end
341350

342351
# Find the index of the next nontrivia token
343-
#
344-
# Postcondition: After returning `i`, the lookahead buffer will buffer tokens
345-
# at least up until stream.lookahead[i+1]
346352
@inline function _lookahead_index(stream::ParseStream, n::Integer, skip_newlines::Bool)
347353
# Much of the time we'll be peeking ahead a single token and have one or
348354
# zero whitespace tokens before the next token. The following code is an
@@ -434,7 +440,7 @@ function peek_token(stream::ParseStream, n::Integer=1;
434440
if !skip_whitespace
435441
i = stream.lookahead_index
436442
end
437-
return @inbounds stream.lookahead[i]
443+
return @inbounds head(stream.lookahead[i])
438444
end
439445

440446

@@ -459,11 +465,10 @@ function peek_full_token(stream::ParseStream, n::Integer=1;
459465
if !skip_whitespace
460466
i = stream.lookahead_index
461467
end
462-
tok = stream.lookahead[i]
468+
t = stream.lookahead[i]
463469

464-
FullToken(head(tok),
465-
first_byte(tok),
466-
first_byte(stream.lookahead[i+1]) - 1)
470+
FullToken(head(t), lookahead_token_first_byte(stream, i),
471+
lookahead_token_last_byte(stream, i))
467472
end
468473

469474
"""
@@ -541,7 +546,7 @@ function _bump_until_n(stream::ParseStream, n::Integer, flags, remap_kind=K"None
541546
is_trivia && (f |= TRIVIA_FLAG)
542547
outk = (is_trivia || remap_kind == K"None") ? k : remap_kind
543548
h = SyntaxHead(outk, f)
544-
push!(stream.tokens, SyntaxToken(h, kind(tok), first_byte(tok)))
549+
push!(stream.tokens, SyntaxToken(h, kind(tok), tok.next_byte))
545550
end
546551
stream.lookahead_index = n + 1
547552
# Defuse the time bomb
@@ -608,7 +613,7 @@ whitespace if necessary with bump_trivia.
608613
function bump_glue(stream::ParseStream, kind, flags, num_tokens)
609614
i = stream.lookahead_index
610615
h = SyntaxHead(kind, flags)
611-
push!(stream.tokens, SyntaxToken(h, stream.lookahead[i].first_byte))
616+
push!(stream.tokens, SyntaxToken(h, stream.lookahead[i+1].next_byte))
612617
stream.lookahead_index += num_tokens
613618
stream.peek_count = 0
614619
return position(stream)
@@ -635,11 +640,11 @@ simpler one which only splits preceding dots?
635640
function bump_split(stream::ParseStream, split_spec...)
636641
tok = stream.lookahead[stream.lookahead_index]
637642
stream.lookahead_index += 1
638-
fbyte = tok.first_byte
643+
b = _next_byte(stream)
639644
for (i, (nbyte, k, f)) in enumerate(split_spec)
640645
h = SyntaxHead(k, f)
641-
push!(stream.tokens, SyntaxToken(h, kind(tok), fbyte))
642-
fbyte += nbyte
646+
b = (i == length(split_spec)) ? tok.next_byte : b + nbyte
647+
push!(stream.tokens, SyntaxToken(h, kind(tok), b))
643648
end
644649
stream.peek_count = 0
645650
# Returning position(stream) like the other bump* methods would be
@@ -665,7 +670,7 @@ function reset_node!(stream::ParseStream, pos::ParseStreamPosition;
665670
if token_is_last(stream, pos)
666671
t = stream.tokens[pos.token_index]
667672
stream.tokens[pos.token_index] = SyntaxToken(_reset_node_head(t, kind, flags),
668-
t.orig_kind, t.first_byte)
673+
t.orig_kind, t.next_byte)
669674
else
670675
r = stream.ranges[pos.range_index]
671676
stream.ranges[pos.range_index] = TaggedRange(_reset_node_head(r, kind, flags),
@@ -682,17 +687,17 @@ Hack alert! This is used only for managing the complicated rules related to
682687
dedenting triple quoted strings.
683688
"""
684689
function steal_token_bytes!(stream::ParseStream, pos::ParseStreamPosition, numbytes)
685-
# Token index to modify
686-
i = pos.token_index + 1
687-
t = stream.tokens[i]
688-
# Compute new token
689-
next_byte = token_first_byte(stream, i + 1)
690-
first_byte = t.first_byte + numbytes
691-
is_empty = first_byte >= next_byte
692-
head2 = is_empty ? SyntaxHead(K"TOMBSTONE", EMPTY_FLAGS) : t.head
693-
stream.tokens[i] = SyntaxToken(head2, t.orig_kind, first_byte)
690+
i = pos.token_index
691+
t1 = stream.tokens[i]
692+
t2 = stream.tokens[i+1]
694693

695-
return is_empty
694+
t1_next_byte = t1.next_byte + numbytes
695+
stream.tokens[i] = SyntaxToken(t1.head, t1.orig_kind, t1_next_byte)
696+
697+
t2_is_empty = t1_next_byte == t2.next_byte
698+
head2 = t2_is_empty ? SyntaxHead(K"TOMBSTONE", EMPTY_FLAGS) : t2.head
699+
stream.tokens[i+1] = SyntaxToken(head2, t2.orig_kind, t2.next_byte)
700+
return t2_is_empty
696701
end
697702

698703
function Base.position(stream::ParseStream)
@@ -714,7 +719,7 @@ function emit(stream::ParseStream, mark::ParseStreamPosition, kind::Kind,
714719
# The first child must be a leaf, otherwise ranges would be improperly
715720
# nested.
716721
fbyte = token_first_byte(stream, first_token)
717-
lbyte = _next_byte(stream) - 1
722+
lbyte = token_last_byte(stream, lastindex(stream.tokens))
718723
_emit_diagnostic(stream, fbyte, lbyte, error=error)
719724
end
720725
push!(stream.ranges, range)
@@ -745,8 +750,8 @@ function emit_diagnostic(stream::ParseStream; whitespace=false, kws...)
745750
end_tok_i = is_whitespace(stream.lookahead[i]) ?
746751
i : max(stream.lookahead_index, i - 1)
747752
end
748-
fbyte = first_byte(stream.lookahead[begin_tok_i])
749-
lbyte = first_byte(stream.lookahead[end_tok_i + 1]) - 1
753+
fbyte = lookahead_token_first_byte(stream, begin_tok_i)
754+
lbyte = lookahead_token_last_byte(stream, end_tok_i)
750755
_emit_diagnostic(stream, fbyte, lbyte; kws...)
751756
return nothing
752757
end
@@ -808,8 +813,7 @@ function build_tree(::Type{NodeType}, stream::ParseStream;
808813
i += 1
809814
continue # Ignore removed tokens
810815
end
811-
next_byte = token_first_byte(stream, i + 1)
812-
node = NodeType(head(t), next_byte - t.first_byte)
816+
node = NodeType(head(t), token_span(stream, i))
813817
push!(stack, (first_token=i, node=node))
814818
i += 1
815819
end
@@ -881,9 +885,15 @@ Return the `Vector{UInt8}` text buffer being parsed by this `ParseStream`.
881885
"""
882886
textbuf(stream) = stream.textbuf
883887

884-
function first_byte(stream::ParseStream)
885-
isempty(stream.tokens) ? _next_byte(stream) : first_byte(first(stream.tokens))
886-
end
887-
888+
first_byte(stream::ParseStream) = first(stream.tokens).next_byte # Use sentinel token
888889
last_byte(stream::ParseStream) = _next_byte(stream)-1
889890
any_error(stream::ParseStream) = any_error(stream.diagnostics)
891+
892+
"""
    empty!(stream::ParseStream)

Discard all output tokens and ranges recorded so far in `stream`.

The token list is never left truly empty: a fresh `K"TOMBSTONE"` sentinel is
pushed which carries the `next_byte` of the last token that was present, so
the first byte of the next real token can still be computed from its
predecessor. Returns `stream`, following the `Base.empty!` convention of
returning the emptied collection.
"""
function Base.empty!(stream::ParseStream)
    # Remember where the already-emitted output ended before throwing it away.
    next_byte = last(stream.tokens).next_byte
    empty!(stream.tokens)
    # Restore sentinel token
    push!(stream.tokens, SyntaxToken(SyntaxHead(K"TOMBSTONE", EMPTY_FLAGS),
                                     K"TOMBSTONE", next_byte))
    empty!(stream.ranges)
    return stream
end

src/parser_api.jl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,7 @@ function parseall(::Type{T}, input...; rule=:toplevel, version=VERSION,
148148
stream = ParseStream(input...; version=version)
149149
if ignore_trivia && rule != :toplevel
150150
bump_trivia(stream, skip_newlines=true)
151-
empty!(stream.tokens)
152-
empty!(stream.ranges)
151+
empty!(stream)
153152
end
154153
parse(stream; rule=rule)
155154
if (ignore_trivia && peek(stream, skip_newlines=true) != K"EndMarker") ||

0 commit comments

Comments
 (0)