Skip to content

Commit 5bb321a

Browse files
authored
Construct Expr directly from ParseStream (#268)
Generalize `build_tree` so that we can more easily construct tree types other than `GreenNode`. Use this to construct `Expr` directly from `ParseStream` rather than constructing both `GreenNode` and `SyntaxNode` as intermediate steps. Also fix a number of type instabilities in the `Expr` conversion code. With these changes, parsing all of Base to `Expr` is about 35% faster overall and allocations are reduced by around 50%. (Parsing to `Expr` is now comparable with parsing to `SyntaxNode`.)
1 parent 975cd1d commit 5bb321a

File tree

10 files changed

+665
-508
lines changed

10 files changed

+665
-508
lines changed

src/expr.jl

Lines changed: 314 additions & 215 deletions
Large diffs are not rendered by default.

src/green_tree.jl

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -43,25 +43,10 @@ struct GreenNode{Head}
4343
args::Union{Tuple{},Vector{GreenNode{Head}}}
4444
end
4545

46-
function GreenNode{Head}(head::Head, span::Integer) where {Head}
47-
GreenNode{Head}(head, span, ())
46+
function GreenNode(head::Head, span::Integer, args) where {Head}
47+
GreenNode{Head}(head, span, args)
4848
end
4949

50-
function GreenNode(head::Head, span::Integer) where {Head}
51-
GreenNode{Head}(head, span, ())
52-
end
53-
54-
function GreenNode(head::Head, args) where {Head}
55-
children = collect(GreenNode{Head}, args)
56-
span = isempty(children) ? 0 : sum(x.span for x in children)
57-
GreenNode{Head}(head, span, children)
58-
end
59-
60-
function GreenNode(head::Head, args::GreenNode{Head}...) where {Head}
61-
GreenNode{Head}(head, GreenNode{Head}[args...])
62-
end
63-
64-
6550
# Accessors / predicates
6651
haschildren(node::GreenNode) = !(node.args isa Tuple{})
6752
children(node::GreenNode) = node.args
@@ -115,3 +100,11 @@ function Base.show(io::IO, ::MIME"text/plain", node::GreenNode, str::AbstractStr
115100
_show_green_node(io, node, "", 1, str, show_trivia)
116101
end
117102

103+
function build_tree(::Type{GreenNode}, stream::ParseStream; kws...)
104+
build_tree(GreenNode{SyntaxHead}, stream; kws...) do h, srcrange, cs
105+
span = length(srcrange)
106+
isnothing(cs) ? GreenNode(h, span, ()) :
107+
GreenNode(h, span, collect(GreenNode{SyntaxHead}, cs))
108+
end
109+
end
110+

src/hooks.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ function _core_parser_hook(code, filename::String, lineno::Int, offset::Int, opt
195195
#
196196
ex = build_tree(Expr, stream; filename=filename,
197197
wrap_toplevel_as_kind=K"None", first_line=lineno)
198-
if Meta.isexpr(ex, :None)
198+
if @isexpr(ex, :None)
199199
# The None wrapping is only to give somewhere for trivia to be
200200
# attached; unwrap!
201201
ex = only(ex.args)
@@ -286,7 +286,7 @@ function _fl_parse_hook(code, filename, lineno, offset, options)
286286
else
287287
if options === :all
288288
ex = Base.parse_input_line(String(code), filename=filename, depwarn=false)
289-
if !Meta.isexpr(ex, :toplevel)
289+
if !@isexpr(ex, :toplevel)
290290
ex = Expr(:toplevel, ex)
291291
end
292292
return ex, sizeof(code)

src/literal_parsing.jl

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ function parse_int_literal(str::AbstractString)
1212
end
1313
if isnothing(x)
1414
x = Base.tryparse(Int128, str)
15-
if isnothing(x)
15+
if x === nothing
1616
x = Base.parse(BigInt, str)
1717
end
1818
end
@@ -358,3 +358,83 @@ function normalize_identifier(str)
358358
flags = Base.Unicode.UTF8PROC_STABLE | Base.Unicode.UTF8PROC_COMPOSE
359359
return isascii(str) ? str : utf8proc_map(str, flags)
360360
end
361+
362+
363+
#-------------------------------------------------------------------------------
364+
function parse_julia_literal(source, head::SyntaxHead, srcrange)
365+
# Leaf node
366+
k = kind(head)
367+
val_str = view(source, srcrange)
368+
# Any errors parsing literals are represented as ErrorVal() - this can
369+
# happen when the user sets `ignore_errors=true` during parsing.
370+
val = if k == K"Integer"
371+
parse_int_literal(val_str)
372+
elseif k == K"Float"
373+
v, code = parse_float_literal(Float64, source.code, first(srcrange),
374+
last(srcrange)+1)
375+
(code === :ok || code === :underflow) ? v : ErrorVal()
376+
elseif k == K"Float32"
377+
v, code = parse_float_literal(Float32, source.code, first(srcrange),
378+
last(srcrange)+1)
379+
(code === :ok || code === :underflow) ? v : ErrorVal()
380+
elseif k in KSet"BinInt OctInt HexInt"
381+
parse_uint_literal(val_str, k)
382+
elseif k == K"true"
383+
true
384+
elseif k == K"false"
385+
false
386+
elseif k == K"Char"
387+
io = IOBuffer()
388+
had_error = unescape_julia_string(io, source.code, first(srcrange),
389+
last(srcrange)+1, Diagnostic[])
390+
if had_error
391+
ErrorVal()
392+
else
393+
seek(io, 0)
394+
c = read(io, Char)
395+
eof(io) ? c : ErrorVal()
396+
end
397+
elseif k == K"Identifier"
398+
if has_flags(head, RAW_STRING_FLAG)
399+
io = IOBuffer()
400+
unescape_raw_string(io, val_str, false)
401+
Symbol(normalize_identifier(String(take!(io))))
402+
else
403+
Symbol(normalize_identifier(val_str))
404+
end
405+
elseif is_keyword(k)
406+
# This should only happen for tokens nested inside errors
407+
Symbol(val_str)
408+
elseif k in KSet"String CmdString"
409+
io = IOBuffer()
410+
had_error = false
411+
if has_flags(head, RAW_STRING_FLAG)
412+
unescape_raw_string(io, val_str, k == K"CmdString")
413+
else
414+
had_error = unescape_julia_string(io, source.code, first(srcrange),
415+
last(srcrange)+1, Diagnostic[])
416+
end
417+
had_error ? ErrorVal() : String(take!(io))
418+
elseif is_operator(k)
419+
isempty(srcrange) ?
420+
Symbol(untokenize(k)) : # synthetic invisible tokens
421+
Symbol(normalize_identifier(val_str))
422+
elseif k == K"error"
423+
ErrorVal()
424+
elseif k == K"MacroName"
425+
Symbol("@$(normalize_identifier(val_str))")
426+
elseif k == K"StringMacroName"
427+
Symbol("@$(normalize_identifier(val_str))_str")
428+
elseif k == K"CmdMacroName"
429+
Symbol("@$(normalize_identifier(val_str))_cmd")
430+
elseif k == K"core_@cmd"
431+
Symbol("core_@cmd")
432+
elseif is_syntax_kind(head)
433+
nothing
434+
else
435+
# FIXME: this allows us to recover from trivia is_error nodes
436+
# that we insert below
437+
ErrorVal()
438+
end
439+
end
440+

src/parse_stream.jl

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -341,10 +341,6 @@ function token_last_byte(stream::ParseStream, i)
341341
stream.tokens[i].next_byte - 1
342342
end
343343

344-
function token_span(stream::ParseStream, i)
345-
stream.tokens[i].next_byte - stream.tokens[i-1].next_byte
346-
end
347-
348344
function lookahead_token_first_byte(stream, i)
349345
i == 1 ? _next_byte(stream) : stream.lookahead[i-1].next_byte
350346
end
@@ -961,24 +957,25 @@ end
961957
# API for extracting results from ParseStream
962958

963959
"""
964-
build_tree(::Type{NodeType}, stream::ParseStream;
960+
build_tree(make_node::Function, ::Type{StackEntry}, stream::ParseStream;
965961
wrap_toplevel_as_kind=nothing, kws...)
966962
967-
Construct a tree with `NodeType` nodes from a ParseStream using depth-first
968-
traversal. `NodeType` must have the constructors
963+
Construct a tree from a ParseStream using depth-first traversal. `make_node`
964+
must have the signature
965+
966+
make_node(head::SyntaxHead, srcrange::UnitRange, children)
969967
970-
NodeType(head::SyntaxHead, span::Integer)
971-
NodeType(head::SyntaxHead, span::Integer, children::Vector{NodeType})
968+
where `children` is either `nothing` for leaf nodes or an iterable of the
969+
children of type `StackEntry` for internal nodes. `StackEntry` may be a node
970+
type, but also may include other information required during building the tree.
972971
973972
A single node which covers the input is expected, but if the ParseStream has
974973
multiple nodes at the top level, `wrap_toplevel_as_kind` may be used to wrap
975974
them in a single node.
976975
977-
The tree here is constructed depth-first, but it would also be possible to use
978-
a bottom-up tree builder interface similar to rust-analyzer. (In that case we'd
979-
traverse the list of ranges backward rather than forward.)
976+
The tree here is constructed depth-first in postorder.
980977
"""
981-
function build_tree(::Type{NodeType}, stream::ParseStream;
978+
function build_tree(make_node::Function, ::Type{NodeType}, stream::ParseStream;
982979
wrap_toplevel_as_kind=nothing, kws...) where NodeType
983980
stack = Vector{NamedTuple{(:first_token,:node),Tuple{Int,NodeType}}}()
984981

@@ -996,8 +993,15 @@ function build_tree(::Type{NodeType}, stream::ParseStream;
996993
i += 1
997994
continue # Ignore removed tokens
998995
end
999-
node = NodeType(head(t), token_span(stream, i))
1000-
push!(stack, (first_token=i, node=node))
996+
srcrange = (stream.tokens[i-1].next_byte:
997+
stream.tokens[i].next_byte - 1)
998+
h = head(t)
999+
children = (is_syntax_kind(h) || is_keyword(h)) ?
1000+
(stack[n].node for n=1:0) : nothing
1001+
node = make_node(h, srcrange, children)
1002+
if !isnothing(node)
1003+
push!(stack, (first_token=i, node=node))
1004+
end
10011005
i += 1
10021006
end
10031007
if j > lastindex(ranges)
@@ -1018,25 +1022,31 @@ function build_tree(::Type{NodeType}, stream::ParseStream;
10181022
while k > 1 && r.first_token <= stack[k-1].first_token
10191023
k -= 1
10201024
end
1025+
srcrange = (stream.tokens[r.first_token-1].next_byte:
1026+
stream.tokens[r.last_token].next_byte - 1)
10211027
children = (stack[n].node for n = k:length(stack))
1022-
node = NodeType(head(r), children)
1028+
node = make_node(head(r), srcrange, children)
10231029
resize!(stack, k-1)
1024-
push!(stack, (first_token=r.first_token, node=node))
1030+
if !isnothing(node)
1031+
push!(stack, (first_token=r.first_token, node=node))
1032+
end
10251033
j += 1
10261034
end
10271035
end
10281036
if length(stack) == 1
10291037
return only(stack).node
10301038
elseif !isnothing(wrap_toplevel_as_kind)
10311039
# Mostly for debugging
1040+
srcrange = (stream.tokens[1].next_byte:
1041+
stream.tokens[end].next_byte - 1)
10321042
children = (x.node for x in stack)
1033-
return NodeType(SyntaxHead(wrap_toplevel_as_kind, EMPTY_FLAGS), children)
1043+
return make_node(SyntaxHead(wrap_toplevel_as_kind, EMPTY_FLAGS),
1044+
srcrange, children)
10341045
else
10351046
error("Found multiple nodes at top level")
10361047
end
10371048
end
10381049

1039-
10401050
"""
10411051
sourcetext(stream::ParseStream; steal_textbuf=true)
10421052

src/precompile.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Just parse some file as a precompile workload
2-
let filename = joinpath(@__DIR__, "literal_parsing.jl")
3-
text = read(filename, String)
4-
parseall(Expr, text)
5-
end
2+
# let filename = joinpath(@__DIR__, "literal_parsing.jl")
3+
# text = read(filename, String)
4+
# parseall(Expr, text)
5+
# end

src/syntax_tree.jl

Lines changed: 1 addition & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -46,85 +46,9 @@ Base.show(io::IO, ::ErrorVal) = printstyled(io, "✘", color=:light_red)
4646

4747
function SyntaxNode(source::SourceFile, raw::GreenNode{SyntaxHead}, position::Integer=1)
4848
if !haschildren(raw) && !(is_syntax_kind(raw) || is_keyword(raw))
49-
# Leaf node
50-
k = kind(raw)
51-
val_range = position:position + span(raw) - 1
52-
val_str = view(source, val_range)
5349
# Here we parse the values eagerly rather than representing them as
5450
# strings. Maybe this is good. Maybe not.
55-
#
56-
# Any errors parsing literals are represented as ErrorVal() - this can
57-
# happen when the user sets `ignore_errors=true` during parsing.
58-
val = if k == K"Integer"
59-
parse_int_literal(val_str)
60-
elseif k == K"Float"
61-
v, code = parse_float_literal(Float64, source.code, position,
62-
position+span(raw))
63-
(code === :ok || code === :underflow) ? v : ErrorVal()
64-
elseif k == K"Float32"
65-
v, code = parse_float_literal(Float32, source.code, position,
66-
position+span(raw))
67-
(code === :ok || code === :underflow) ? v : ErrorVal()
68-
elseif k in KSet"BinInt OctInt HexInt"
69-
parse_uint_literal(val_str, k)
70-
elseif k == K"true"
71-
true
72-
elseif k == K"false"
73-
false
74-
elseif k == K"Char"
75-
io = IOBuffer()
76-
had_error = unescape_julia_string(io, source.code, position,
77-
position+span(raw), Diagnostic[])
78-
if had_error
79-
ErrorVal()
80-
else
81-
seek(io, 0)
82-
c = read(io, Char)
83-
eof(io) ? c : ErrorVal()
84-
end
85-
elseif k == K"Identifier"
86-
if has_flags(head(raw), RAW_STRING_FLAG)
87-
io = IOBuffer()
88-
unescape_raw_string(io, val_str, false)
89-
Symbol(normalize_identifier(String(take!(io))))
90-
else
91-
Symbol(normalize_identifier(val_str))
92-
end
93-
elseif is_keyword(k)
94-
# This should only happen for tokens nested inside errors
95-
Symbol(val_str)
96-
elseif k in KSet"String CmdString"
97-
io = IOBuffer()
98-
had_error = false
99-
if has_flags(head(raw), RAW_STRING_FLAG)
100-
unescape_raw_string(io, val_str, k == K"CmdString")
101-
else
102-
had_error = unescape_julia_string(io, source.code, position,
103-
position+span(raw), Diagnostic[])
104-
end
105-
had_error ? ErrorVal() : String(take!(io))
106-
elseif is_operator(k)
107-
isempty(val_range) ?
108-
Symbol(untokenize(k)) : # synthetic invisible tokens
109-
Symbol(normalize_identifier(val_str))
110-
elseif k == K"error"
111-
ErrorVal()
112-
elseif k == K"MacroName"
113-
Symbol("@$(normalize_identifier(val_str))")
114-
elseif k == K"StringMacroName"
115-
Symbol("@$(normalize_identifier(val_str))_str")
116-
elseif k == K"CmdMacroName"
117-
Symbol("@$(normalize_identifier(val_str))_cmd")
118-
elseif k == K"core_@cmd"
119-
Symbol("core_@cmd")
120-
elseif is_syntax_kind(raw)
121-
nothing
122-
else
123-
# FIXME: this allows us to recover from trivia is_error nodes
124-
# that we insert below
125-
@debug "Leaf node of kind $k unknown to SyntaxNode"
126-
ErrorVal()
127-
end
51+
val = parse_julia_literal(source, head(raw), position:position + span(raw) - 1)
12852
return SyntaxNode(nothing, nothing, SyntaxData(source, raw, position, val))
12953
else
13054
cs = SyntaxNode[]

test/benchmark.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@ end
1616
all_base_code = concat_base()
1717

1818
b_ParseStream = @benchmark JuliaSyntax.parse!(JuliaSyntax.ParseStream(all_base_code), rule=:all)
19-
b_GreenNode = @benchmark JuliaSyntax.parseall(JuliaSyntax.GreenNode, all_base_code)
20-
b_SyntaxNode = @benchmark JuliaSyntax.parseall(JuliaSyntax.SyntaxNode, all_base_code)
21-
b_Expr = @benchmark JuliaSyntax.parseall(Expr, all_base_code)
19+
b_GreenNode = @benchmark JuliaSyntax.parseall(JuliaSyntax.GreenNode, all_base_code, ignore_warnings=true)
20+
b_SyntaxNode = @benchmark JuliaSyntax.parseall(JuliaSyntax.SyntaxNode, all_base_code, ignore_warnings=true)
21+
b_Expr = @benchmark JuliaSyntax.parseall(Expr, all_base_code, ignore_warnings=true)
2222

2323
@info "Benchmarks" ParseStream=b_ParseStream GreenNode=b_GreenNode SyntaxNode=b_SyntaxNode Expr=b_Expr
2424

0 commit comments

Comments
 (0)