@@ -105,24 +105,27 @@ numeric_flags(x) = numeric_flags(flags(x))
105
105
# -------------------------------------------------------------------------------
106
106
"""
107
107
`SyntaxToken` is a token covering a contiguous byte range in the input text.
108
- Information about preceding whitespace is added for use by the parser.
108
+
109
+ We record only the `next_byte` here (the index of the next byte *after* the
110
+ token) to avoid duplication of data between neighbouring tokens. This is more
111
+ useful than recording the first byte, as it allows an initial fixed sentinel
112
+ token to be used for recording the first byte of the first real token.
109
113
"""
110
114
struct SyntaxToken
    # Head of the token.
    head::SyntaxHead
    # Kind stored alongside `head`; the two-argument constructor sets it to
    # `kind(head)`.
    orig_kind::Kind
    # Index of the first byte *after* the token.
    next_byte::UInt32
end
115
119
116
# Convenience constructor: `orig_kind` is taken to be the kind of `head`.
SyntaxToken(head::SyntaxHead, next_byte::Integer) =
    SyntaxToken(head, kind(head), next_byte)
119
123
120
124
# Print the untokenized head, right-padded to 15 columns, followed by a
# separator and the token's `next_byte`.
function Base.show(io::IO, tok::SyntaxToken)
    label = rpad(untokenize(tok.head, unique=false), 15)
    print(io, label, " | ", tok.next_byte)
end
123
127
124
128
# Accessor for the `head` field of a token.
function head(tok::SyntaxToken)
    return tok.head
end
126
129
127
130
128
131
# -------------------------------------------------------------------------------
@@ -200,13 +203,16 @@ mutable struct ParseStream
200
203
# numbers. This means we're inexact for old dev versions but that seems
201
204
# like an acceptable tradeoff.
202
205
ver = (version. major, version. minor)
206
+ # Initial sentinel token containing the first byte of the first real token.
207
+ sentinel = SyntaxToken (SyntaxHead (K " TOMBSTONE" , EMPTY_FLAGS),
208
+ K " TOMBSTONE" , next_byte)
203
209
new (text_buf,
204
210
text_root,
205
211
lexer,
206
212
Vector {SyntaxToken} (),
207
213
1 ,
208
214
Vector {Vector{ParseStreamPosition}} (),
209
- Vector { SyntaxToken} () ,
215
+ SyntaxToken[sentinel] ,
210
216
Vector {TaggedRange} (),
211
217
Vector {Diagnostic} (),
212
218
0 ,
@@ -282,12 +288,25 @@ function token_is_last(stream, pos)
282
288
pos. token_index > stream. ranges[pos. range_index]. last_token
283
289
end
284
290
285
# Compute the first byte of the token at index `i` in `stream.tokens`. The
# start of a token is not stored on the token itself; it is read from the
# `next_byte` of the entry at `i - 1`.
function token_first_byte(stream, i)
    previous = stream.tokens[i - 1]
    return previous.next_byte
end
295
+
296
# Last byte of the token at index `i` in `stream.tokens`: one less than that
# token's `next_byte`.
function token_last_byte(stream::ParseStream, i)
    tok = stream.tokens[i]
    return tok.next_byte - 1
end
299
+
300
# Number of bytes covered by the token at index `i`: the difference between its
# `next_byte` and the `next_byte` of the entry before it.
function token_span(stream::ParseStream, i)
    toks = stream.tokens
    return toks[i].next_byte - toks[i - 1].next_byte
end
303
+
304
# First byte of the lookahead token at index `i` in `stream.lookahead`. The
# first entry has no predecessor in that buffer, so its start comes from
# `_next_byte(stream)`; any other entry starts at the previous entry's
# `next_byte`.
function lookahead_token_first_byte(stream, i)
    if i == 1
        return _next_byte(stream)
    end
    return stream.lookahead[i - 1].next_byte
end
307
+
308
# Last byte of the lookahead token at index `i` in `stream.lookahead`: one less
# than that token's `next_byte`.
function lookahead_token_last_byte(stream, i)
    tok = stream.lookahead[i]
    return tok.next_byte - 1
end
292
311
293
312
# -------------------------------------------------------------------------------
297
316
function _buffer_lookahead_tokens (lexer, lookahead)
298
317
had_whitespace = false
299
318
token_count = 0
300
- done = false
301
319
while true
302
320
raw = Tokenize. Lexers. next_token (lexer)
303
321
k = TzTokens. exactkind (raw)
@@ -307,7 +325,7 @@ function _buffer_lookahead_tokens(lexer, lookahead)
307
325
had_whitespace && (f |= PRECEDING_WHITESPACE_FLAG)
308
326
raw. dotop && (f |= DOTOP_FLAG)
309
327
raw. suffix && (f |= SUFFIXED_FLAG)
310
- push! (lookahead, SyntaxToken (SyntaxHead (k, f), raw. startbyte + 1 ))
328
+ push! (lookahead, SyntaxToken (SyntaxHead (k, f), raw. endbyte + 2 ))
311
329
token_count += 1
312
330
if k == K " EndMarker"
313
331
break
@@ -318,31 +336,19 @@ function _buffer_lookahead_tokens(lexer, lookahead)
318
336
# but not too large to avoid (a) polluting the processor cache and
319
337
# (b) doing unnecessary work when not parsing the whole input.
320
338
had_whitespace = false
321
- if done
322
- break
323
- end
324
339
if token_count > 100
325
- # Buffer at least one token after the last so we can get the
326
- # current token's last byte based on the next token. (May need
327
- # more than one to correctly apply had_whitespace state.)
328
- done = true
340
+ break
329
341
end
330
342
end
331
343
end
332
344
end
333
345
334
346
# Return the index of the next byte of the input, read from the `next_byte` of
# the final entry of `stream.tokens`.
function _next_byte(stream)
    return stream.tokens[end].next_byte
end
341
350
342
351
# Find the index of the next nontrivia token
343
- #
344
- # Postcondition: After returning `i`, the lookahead buffer will buffers tokens
345
- # at least up until stream.lookahead[i+1]
346
352
@inline function _lookahead_index (stream:: ParseStream , n:: Integer , skip_newlines:: Bool )
347
353
# Much of the time we'll be peeking ahead a single token and have one or
348
354
# zero whitespace tokens before the next token. The following code is an
@@ -434,7 +440,7 @@ function peek_token(stream::ParseStream, n::Integer=1;
434
440
if ! skip_whitespace
435
441
i = stream. lookahead_index
436
442
end
437
- return @inbounds stream. lookahead[i]
443
+ return @inbounds head ( stream. lookahead[i])
438
444
end
439
445
440
446
@@ -459,11 +465,10 @@ function peek_full_token(stream::ParseStream, n::Integer=1;
459
465
if ! skip_whitespace
460
466
i = stream. lookahead_index
461
467
end
462
- tok = stream. lookahead[i]
468
+ t = stream. lookahead[i]
463
469
464
- FullToken (head (tok),
465
- first_byte (tok),
466
- first_byte (stream. lookahead[i+ 1 ]) - 1 )
470
+ FullToken (head (t), lookahead_token_first_byte (stream, i),
471
+ lookahead_token_last_byte (stream, i))
467
472
end
468
473
469
474
"""
@@ -541,7 +546,7 @@ function _bump_until_n(stream::ParseStream, n::Integer, flags, remap_kind=K"None
541
546
is_trivia && (f |= TRIVIA_FLAG)
542
547
outk = (is_trivia || remap_kind == K " None" ) ? k : remap_kind
543
548
h = SyntaxHead (outk, f)
544
- push! (stream. tokens, SyntaxToken (h, kind (tok), first_byte ( tok) ))
549
+ push! (stream. tokens, SyntaxToken (h, kind (tok), tok. next_byte ))
545
550
end
546
551
stream. lookahead_index = n + 1
547
552
# Defuse the time bomb
@@ -608,7 +613,7 @@ whitespace if necessary with bump_trivia.
608
613
function bump_glue (stream:: ParseStream , kind, flags, num_tokens)
609
614
i = stream. lookahead_index
610
615
h = SyntaxHead (kind, flags)
611
- push! (stream. tokens, SyntaxToken (h, stream. lookahead[i] . first_byte ))
616
+ push! (stream. tokens, SyntaxToken (h, stream. lookahead[i+ 1 ] . next_byte ))
612
617
stream. lookahead_index += num_tokens
613
618
stream. peek_count = 0
614
619
return position (stream)
@@ -635,11 +640,11 @@ simpler one which only splits preceding dots?
635
640
function bump_split (stream:: ParseStream , split_spec... )
636
641
tok = stream. lookahead[stream. lookahead_index]
637
642
stream. lookahead_index += 1
638
- fbyte = tok . first_byte
643
+ b = _next_byte (stream)
639
644
for (i, (nbyte, k, f)) in enumerate (split_spec)
640
645
h = SyntaxHead (k, f)
641
- push! (stream . tokens, SyntaxToken (h, kind (tok), fbyte))
642
- fbyte += nbyte
646
+ b = (i == length (split_spec)) ? tok . next_byte : b + nbyte
647
+ push! (stream . tokens, SyntaxToken (h, kind (tok), b))
643
648
end
644
649
stream. peek_count = 0
645
650
# Returning position(stream) like the other bump* methods would be
@@ -665,7 +670,7 @@ function reset_node!(stream::ParseStream, pos::ParseStreamPosition;
665
670
if token_is_last (stream, pos)
666
671
t = stream. tokens[pos. token_index]
667
672
stream. tokens[pos. token_index] = SyntaxToken (_reset_node_head (t, kind, flags),
668
- t. orig_kind, t. first_byte )
673
+ t. orig_kind, t. next_byte )
669
674
else
670
675
r = stream. ranges[pos. range_index]
671
676
stream. ranges[pos. range_index] = TaggedRange (_reset_node_head (r, kind, flags),
@@ -682,17 +687,17 @@ Hack alert! This is used only for managing the complicated rules related to
682
687
dedenting triple quoted strings.
683
688
"""
684
689
function steal_token_bytes!(stream::ParseStream, pos::ParseStreamPosition, numbytes)
    idx = pos.token_index
    donor_idx = idx + 1
    receiver = stream.tokens[idx]
    donor = stream.tokens[donor_idx]

    # Grow the token at `pos` by moving its end `numbytes` further along.
    new_boundary = receiver.next_byte + numbytes
    stream.tokens[idx] = SyntaxToken(receiver.head, receiver.orig_kind, new_boundary)

    # The following token keeps its own end. When the moved boundary coincides
    # with that end the token has no bytes left, so its head becomes a TOMBSTONE.
    donor_is_empty = new_boundary == donor.next_byte
    donor_head = if donor_is_empty
        SyntaxHead(K"TOMBSTONE", EMPTY_FLAGS)
    else
        donor.head
    end
    stream.tokens[donor_idx] = SyntaxToken(donor_head, donor.orig_kind, donor.next_byte)
    return donor_is_empty
end
697
702
698
703
function Base. position (stream:: ParseStream )
@@ -714,7 +719,7 @@ function emit(stream::ParseStream, mark::ParseStreamPosition, kind::Kind,
714
719
# The first child must be a leaf, otherwise ranges would be improperly
715
720
# nested.
716
721
fbyte = token_first_byte (stream, first_token)
717
- lbyte = _next_byte (stream) - 1
722
+ lbyte = token_last_byte (stream, lastindex (stream . tokens))
718
723
_emit_diagnostic (stream, fbyte, lbyte, error= error)
719
724
end
720
725
push! (stream. ranges, range)
@@ -745,8 +750,8 @@ function emit_diagnostic(stream::ParseStream; whitespace=false, kws...)
745
750
end_tok_i = is_whitespace (stream. lookahead[i]) ?
746
751
i : max (stream. lookahead_index, i - 1 )
747
752
end
748
- fbyte = first_byte (stream. lookahead[ begin_tok_i] )
749
- lbyte = first_byte (stream. lookahead[end_tok_i + 1 ]) - 1
753
+ fbyte = lookahead_token_first_byte (stream, begin_tok_i)
754
+ lbyte = lookahead_token_last_byte (stream, end_tok_i)
750
755
_emit_diagnostic (stream, fbyte, lbyte; kws... )
751
756
return nothing
752
757
end
@@ -808,8 +813,7 @@ function build_tree(::Type{NodeType}, stream::ParseStream;
808
813
i += 1
809
814
continue # Ignore removed tokens
810
815
end
811
- next_byte = token_first_byte (stream, i + 1 )
812
- node = NodeType (head (t), next_byte - t. first_byte)
816
+ node = NodeType (head (t), token_span (stream, i))
813
817
push! (stack, (first_token= i, node= node))
814
818
i += 1
815
819
end
@@ -881,9 +885,15 @@ Return the `Vector{UInt8}` text buffer being parsed by this `ParseStream`.
881
885
"""
882
886
function textbuf(stream)
    return stream.textbuf
end
883
887
884
# First byte of the stream, read from the sentinel token kept at the front of
# `stream.tokens`.
function first_byte(stream::ParseStream)
    sentinel = first(stream.tokens)
    return sentinel.next_byte
end
888
889
# The byte immediately before `_next_byte(stream)`.
function last_byte(stream::ParseStream)
    return _next_byte(stream) - 1
end
889
890
# Forward to the `any_error` method for the stream's diagnostics.
function any_error(stream::ParseStream)
    return any_error(stream.diagnostics)
end
891
+
892
# Discard all stored tokens and ranges. The `next_byte` of the last token is
# carried over into a fresh TOMBSTONE sentinel, so `stream.tokens` is left with
# exactly one entry holding that position. Returns the emptied `stream.ranges`
# (the value of the final `empty!` call).
function Base.empty!(stream::ParseStream)
    resume_at = last(stream.tokens).next_byte
    empty!(stream.tokens)
    # Restore sentinel token
    sentinel_head = SyntaxHead(K"TOMBSTONE", EMPTY_FLAGS)
    push!(stream.tokens, SyntaxToken(sentinel_head, K"TOMBSTONE", resume_at))
    return empty!(stream.ranges)
end
0 commit comments