diff --git a/src/syntax_graph.jl b/src/syntax_graph.jl index 767643b..8c6de6f 100644 --- a/src/syntax_graph.jl +++ b/src/syntax_graph.jl @@ -325,12 +325,17 @@ end struct SourceRef file::SourceFile first_byte::Int - # TODO: Do we need the green node, or would last_byte suffice? - green_tree::JuliaSyntax.GreenNode + last_byte::Int + # TODO: `nothing` is only used when we `prune` a tree, but we may not need + # this field at all. + green_tree::Union{Nothing, JuliaSyntax.GreenNode} end +SourceRef(file::SourceFile, first_byte::Int, green_tree::Union{Nothing, JuliaSyntax.GreenNode}) = + SourceRef(file, first_byte, first_byte + span(green_tree) - 1, green_tree) + JuliaSyntax.sourcefile(src::SourceRef) = src.file -JuliaSyntax.byte_range(src::SourceRef) = src.first_byte:(src.first_byte + span(src.green_tree) - 1) +JuliaSyntax.byte_range(src::SourceRef) = src.first_byte:src.last_byte # TODO: Adding these methods to support LineNumberNode is kind of hacky but we # can remove these after JuliaLowering becomes self-bootstrapping for macros @@ -817,3 +822,141 @@ end # end # out # end + +#------------------------------------------------------------------------------- +# Data structure utilities + +""" + unalias_nodes(st::SyntaxTree) + +Return a tree where each descendent of `st` has exactly one parent in `st`. The +returned tree is identical to `st` in all but underlying representation, where +every additional parent to a subtree generates a copy of that subtree. Apart +from this, `unalias_nodes` should not allocate new nodes unnecessarily. + + unalias_nodes(sl::SyntaxList) + +If a `SyntaxList` is given, every resulting tree will be unique with respect to +each other as well as internally. A duplicate entry will produce a copied tree. +""" +unalias_nodes(st::SyntaxTree) = SyntaxTree( + syntax_graph(st), + _unalias_nodes(syntax_graph(st), st._id, Set{NodeId}(), Set{Int}())) + +function unalias_nodes(sl::SyntaxList) + seen = Set{NodeId}() + seen_edges = Set{Int}() + SyntaxList(syntax_graph(sl), + map(id->_unalias_nodes(syntax_graph(sl), id, seen, seen_edges), sl.ids)) +end + +# Note that `seen_edges` is only needed for when edge ranges overlap, which is a +# situation we don't produce yet. +function _unalias_nodes(graph::SyntaxGraph, id::NodeId, seen::Set{NodeId}, seen_edges::Set{Int}) + if id in seen + id = copy_ast(graph, SyntaxTree(graph, id); copy_source=false)._id + end + if !isempty(intersect(seen_edges, graph.edge_ranges[id])) + # someone is referencing our edges; run away so we can modify them + next_edge = length(graph.edges) + 1 + append!(graph.edges, children(graph, id)) + graph.edge_ranges[id] = next_edge:lastindex(graph.edges) + end + union!(seen_edges, graph.edge_ranges[id]) + push!(seen, id) + + for (c, i) in zip(children(graph, id), graph.edge_ranges[id]) + c2 = _unalias_nodes(graph, c, seen, seen_edges) + # the new child should be the same in every way to the old one, so + # modify the edge instead of triggering copies with `mapchildren` + c !== c2 && (graph.edges[i] = c2) + end + return id +end + +""" +Return a tree where unreachable nodes (non-descendents of `st`) in its graph +have been deleted, and where provenance data has been minimized. + +If `keep` is not nothing, also consider descendents of it reachable. By +default, `keep` is the final node(s) in the provenance chain of `st`. This +means that, by default, we have expression provenance back to the original +parsed nodes, but no lowering-internal provenance. In any case, we still retain +byte (or, with old macros, LineNumberNode) provenance. + +Provenance shrinkage: Green trees are omitted from SourceRefs. If node A +references node B as its source and B is unreachable, A adopts the source of B. +""" +function prune(st::SyntaxTree; keep::Union{SyntaxTree, SyntaxList, Nothing}=flattened_provenance(st)) + entrypoints = NodeId[st._id] + keep isa SyntaxList && append!(entrypoints, keep.ids) + keep isa SyntaxTree && push!(entrypoints, keep._id) + prune(syntax_graph(st), unique(entrypoints))[1] +end + +# This implementation unaliases nodes, which undoes a small amount of space +# savings from the DAG representation, but it allows us to (1) omit the whole +# `edges` array (TODO), and (2) make the pruning algorithm simpler. The +# invariant we win is having `edge_ranges` be one or more interleaved +# level-order traversals where every node's set of children is contiguous, so +# its entries can refer to itself instead of an external `edges` vector. +function prune(graph1_a::SyntaxGraph, entrypoints_a::Vector{NodeId}) + @assert length(entrypoints_a) === length(unique(entrypoints_a)) + unaliased = unalias_nodes(SyntaxList(graph1_a, entrypoints_a)) + (graph1, entrypoints) = (unaliased.graph, unaliased.ids) + nodes1 = NodeId[entrypoints...] # Reachable subset of graph1 + map12 = Dict{NodeId, Int}() # graph1 => graph2 mapping + graph2 = ensure_attributes!(SyntaxGraph(); attrdefs(graph1)...) + while length(graph2.edge_ranges) < length(nodes1) + n2 = length(graph2.edge_ranges) + 1 + n1 = nodes1[n2] + map12[n1] = n2 + push!(graph2.edge_ranges, is_leaf(graph1, n1) ? + (0:-1) : (1:numchildren(graph1, n1)) .+ length(nodes1)) + for c1 in children(graph1, n1) + push!(nodes1, c1) + end + end + graph2.edges = 1:length(nodes1) # our reward for unaliasing + + for attr in attrnames(graph1) + attr === :source && continue + for (n2, n1) in enumerate(nodes1) + if (begin + attrval = get(graph1.attributes[attr], n1, nothing) + !isnothing(attrval) + end) + graph2.attributes[attr][n2] = attrval + end + end + end + + # Prune provenance. Tricky due to dangling `.source` references. + resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 => graph2 + function get_resolved!(id1::NodeId) + res = get(resolved_sources, id1, nothing) + if isnothing(res) + src1 = graph1.source[id1] + res = if haskey(map12, src1) + map12[src1] + elseif src1 isa NodeId + get_resolved!(src1) + elseif src1 isa Tuple + map(get_resolved!, src1) + elseif src1 isa SourceRef + SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing) + else + src1 + end + resolved_sources[id1] = res + end + return res + end + + for (n2, n1) in enumerate(nodes1) + graph2.source[n2] = get_resolved!(n1) + end + + # The first n entries in nodes1 were our entrypoints, unique from unaliasing + return SyntaxList(graph2, 1:length(entrypoints)) +end diff --git a/test/functions_ir.jl b/test/functions_ir.jl index a537757..2f9f6ab 100644 --- a/test/functions_ir.jl +++ b/test/functions_ir.jl @@ -1532,7 +1532,7 @@ end 18 (call core.svec %₁₅ %₁₆ %₁₇) 19 --- method core.nothing %₁₈ slots: [slot₁/#self#(!read) slot₂/x(!read) slot₃/y(!read)] - 1 (meta :generated (new JuliaLowering.GeneratedFunctionStub TestMod.#f_only_generated@generator#0 SourceRef(SourceFile("@generated function f_only_generated(x, y)\n generator_code(x,y)\nend", 0, nothing, 1, [1, 44, 68]), 1, (macrocall (macro_name 1-1::@-t 2-10::Identifier) 11-11::Whitespace-t (function 12-19::function-t 20-20::Whitespace-t (call 21-36::Identifier 37-37::(-t 38-38::Identifier 39-39::,-t 40-40::Whitespace-t 41-41::Identifier 42-42::)-t) (block 43-47::NewlineWs-t (call 48-61::Identifier 62-62::(-t 63-63::Identifier 64-64::,-t 65-65::Identifier 66-66::)-t) 67-67::NewlineWs-t) 68-70::end-t))) (call core.svec :#self# :x :y) (call core.svec))) + 1 (meta :generated (new JuliaLowering.GeneratedFunctionStub TestMod.#f_only_generated@generator#0 SourceRef(SourceFile("@generated function f_only_generated(x, y)\n generator_code(x,y)\nend", 0, nothing, 1, [1, 44, 68]), 1, 70, (macrocall (macro_name 1-1::@-t 2-10::Identifier) 11-11::Whitespace-t (function 12-19::function-t 20-20::Whitespace-t (call 21-36::Identifier 37-37::(-t 38-38::Identifier 39-39::,-t 40-40::Whitespace-t 41-41::Identifier 42-42::)-t) (block 43-47::NewlineWs-t (call 48-61::Identifier 62-62::(-t 63-63::Identifier 64-64::,-t 65-65::Identifier 66-66::)-t) 67-67::NewlineWs-t) 68-70::end-t))) (call core.svec :#self# :x :y) (call core.svec))) 2 (meta :generated_only) 3 (return core.nothing) 20 latestworld @@ -1578,7 +1578,7 @@ end 18 (call core.svec %₁₅ %₁₆ %₁₇) 19 --- method core.nothing %₁₈ slots: [slot₁/#self#(!read) slot₂/x slot₃/y slot₄/maybe_gen_stuff slot₅/nongen_stuff] - 1 (meta :generated (new JuliaLowering.GeneratedFunctionStub TestMod.#f_partially_generated@generator#0 SourceRef(SourceFile("function f_partially_generated(x, y)\n nongen_stuff = bothgen(x, y)\n if @generated\n quote\n maybe_gen_stuff = some_gen_stuff(x, y)\n end\n else\n maybe_gen_stuff = some_nongen_stuff(x, y)\n end\n (nongen_stuff, maybe_gen_stuff)\nend", 0, nothing, 1, [1, 38, 71, 89, 103, 154, 166, 175, 225, 233, 269]), 1, (function 1-8::function-t 9-9::Whitespace-t (call 10-30::Identifier 31-31::(-t 32-32::Identifier 33-33::,-t 34-34::Whitespace-t 35-35::Identifier 36-36::)-t) (block 37-41::NewlineWs-t (= 42-53::Identifier 54-54::Whitespace-t 55-55::=-t 56-56::Whitespace-t (call 57-63::Identifier 64-64::(-t 65-65::Identifier 66-66::,-t 67-67::Whitespace-t 68-68::Identifier 69-69::)-t)) 70-74::NewlineWs-t (if 75-76::if-t 77-77::Whitespace-t (macrocall (macro_name 78-78::@-t 79-87::Identifier)) (block 88-96::NewlineWs-t (quote (block 97-101::quote-t 102-114::NewlineWs-t (= 115-129::Identifier 130-130::Whitespace-t 131-131::=-t 132-132::Whitespace-t (call 133-146::Identifier 147-147::(-t 148-148::Identifier 149-149::,-t 150-150::Whitespace-t 151-151::Identifier 152-152::)-t)) 153-161::NewlineWs-t 162-164::end-t)) 165-169::NewlineWs-t) 170-173::else-t (block 174-182::NewlineWs-t (= 183-197::Identifier 198-198::Whitespace-t 199-199::=-t 200-200::Whitespace-t (call 201-217::Identifier 218-218::(-t 219-219::Identifier 220-220::,-t 221-221::Whitespace-t 222-222::Identifier 223-223::)-t)) 224-228::NewlineWs-t) 229-231::end-t) 232-236::NewlineWs-t (tuple-p 237-237::(-t 238-249::Identifier 250-250::,-t 251-251::Whitespace-t 252-266::Identifier 267-267::)-t) 268-268::NewlineWs-t) 269-271::end-t)) (call core.svec :#self# :x :y) (call core.svec))) + 1 (meta :generated (new JuliaLowering.GeneratedFunctionStub TestMod.#f_partially_generated@generator#0 SourceRef(SourceFile("function f_partially_generated(x, y)\n nongen_stuff = bothgen(x, y)\n if @generated\n quote\n maybe_gen_stuff = some_gen_stuff(x, y)\n end\n else\n maybe_gen_stuff = some_nongen_stuff(x, y)\n end\n (nongen_stuff, maybe_gen_stuff)\nend", 0, nothing, 1, [1, 38, 71, 89, 103, 154, 166, 175, 225, 233, 269]), 1, 271, (function 1-8::function-t 9-9::Whitespace-t (call 10-30::Identifier 31-31::(-t 32-32::Identifier 33-33::,-t 34-34::Whitespace-t 35-35::Identifier 36-36::)-t) (block 37-41::NewlineWs-t (= 42-53::Identifier 54-54::Whitespace-t 55-55::=-t 56-56::Whitespace-t (call 57-63::Identifier 64-64::(-t 65-65::Identifier 66-66::,-t 67-67::Whitespace-t 68-68::Identifier 69-69::)-t)) 70-74::NewlineWs-t (if 75-76::if-t 77-77::Whitespace-t (macrocall (macro_name 78-78::@-t 79-87::Identifier)) (block 88-96::NewlineWs-t (quote (block 97-101::quote-t 102-114::NewlineWs-t (= 115-129::Identifier 130-130::Whitespace-t 131-131::=-t 132-132::Whitespace-t (call 133-146::Identifier 147-147::(-t 148-148::Identifier 149-149::,-t 150-150::Whitespace-t 151-151::Identifier 152-152::)-t)) 153-161::NewlineWs-t 162-164::end-t)) 165-169::NewlineWs-t) 170-173::else-t (block 174-182::NewlineWs-t (= 183-197::Identifier 198-198::Whitespace-t 199-199::=-t 200-200::Whitespace-t (call 201-217::Identifier 218-218::(-t 219-219::Identifier 220-220::,-t 221-221::Whitespace-t 222-222::Identifier 223-223::)-t)) 224-228::NewlineWs-t) 229-231::end-t) 232-236::NewlineWs-t (tuple-p 237-237::(-t 238-249::Identifier 250-250::,-t 251-251::Whitespace-t 252-266::Identifier 267-267::)-t) 268-268::NewlineWs-t) 269-271::end-t)) (call core.svec :#self# :x :y) (call core.svec))) 2 TestMod.bothgen 3 (= slot₅/nongen_stuff (call %₂ slot₂/x slot₃/y)) 4 TestMod.some_nongen_stuff diff --git a/test/syntax_graph.jl b/test/syntax_graph.jl index 60fd10d..75d3801 100644 --- a/test/syntax_graph.jl +++ b/test/syntax_graph.jl @@ -63,10 +63,12 @@ end function testgraph(edge_ranges, edges, more_attrs...) kinds = Dict(map(i->(i=>K"block"), eachindex(edge_ranges))) sources = Dict(map(i->(i=>LineNumberNode(i)), eachindex(edge_ranges))) + orig = Dict(map(i->(i=>i), eachindex(edge_ranges))) SyntaxGraph( edge_ranges, edges, - Dict(:kind => kinds, :source => sources, more_attrs...)) + Dict(:kind => kinds, :source => sources, + :orig => orig, more_attrs...)) end @testset "copy_ast" begin @@ -107,4 +109,175 @@ end # Disallow for now, since we can't prevent dangling sourcerefs @test_throws ErrorException JuliaLowering.copy_ast(new_g, st; copy_source=false) end + + @testset "unalias_nodes" begin + # 1 -+-> 2 ->-> 4 + # | | + # +-> 3 -+ + g = testgraph([1:2, 3:3, 4:4, 0:-1], [2, 3, 4, 4]) + st = SyntaxTree(g, 1) + stu = JuliaLowering.unalias_nodes(st) + @test st ≈ stu + @test length(stu._graph.edge_ranges) == 5 + @test length(stu._graph.edges) == 4 + # Properties of node 4 should be preserved + @test 4 == stu[1][1].orig == stu[2][1].orig + @test st[1][1].source == stu[1][1].source == stu[2][1].source + @test stu[1][1]._id != stu[2][1]._id + + # Try again with overlapping edge_ranges + g = testgraph([1:2, 3:3, 3:3, 0:-1], [2, 3, 4]) + st = SyntaxTree(g, 1) + stu = JuliaLowering.unalias_nodes(st) + @test st ≈ stu + @test length(stu._graph.edge_ranges) == 5 + @test length(stu._graph.edges) == 4 + @test 4 == stu[1][1].orig == stu[2][1].orig + @test st[1][1].source == stu[1][1].source == stu[2][1].source + @test stu[1][1]._id != stu[2][1]._id + + # +-> 5 + # | + # 1 -+-> 2 -+---->>>-> 6 + # | ||| + # +-> 3 -> 7 -+|| + # | || + # +-> 4 -+-----+| + # | | + # +------+ + g = testgraph([1:3, 4:5, 6:6, 7:8, 0:-1, 0:-1, 9:9], + [2, 3, 4, 5, 6, 7, 6, 6, 6]) + st = SyntaxTree(g, 1) + stu = JuliaLowering.unalias_nodes(st) + @test st ≈ stu + # node 6 should be copied three times + @test length(stu._graph.edge_ranges) == 10 + @test length(stu._graph.edges) == 9 + # the four copies of node 6 should have attrs identical to the original and distinct ids + @test 6 == stu[1][2].orig == stu[2][1][1].orig == stu[3][1].orig == stu[3][2].orig + @test stu[1][2]._id != stu[2][1][1]._id != stu[3][1]._id != stu[3][2]._id + + # 1 -+-> 2 ->-> 4 -+----> 5 ->-> 7 + # | | | | + # +-> 3 -+ +-->-> 6 -+ + # | | + # +------------+ + g = testgraph([1:2, 3:3, 4:5, 6:7, 8:8, 9:9, 0:-1], + [2,3,4,4,6,5,6,7,7]) + st = SyntaxTree(g, 1) + stu = JuliaLowering.unalias_nodes(st) + @test st ≈ stu + @test length(stu._graph.edge_ranges) == 15 + @test length(stu._graph.edges) == 14 + # attrs of nodes 4-7 + @test 4 == stu[1][1].orig == stu[2][1].orig + @test 5 == stu[1][1][1].orig == stu[2][1][1].orig + @test 6 == stu[1][1][2].orig == stu[2][1][2].orig == stu[2][2].orig + @test 7 == stu[1][1][1][1].orig == stu[1][1][2][1].orig == + stu[2][1][1][1].orig == stu[2][1][2][1].orig == stu[2][2][1].orig + # ensure no duplication + @test stu[1][1][1][1]._id != stu[1][1][2][1]._id != + stu[2][1][1][1]._id != stu[2][1][2][1]._id != stu[2][2][1]._id + end + + @testset "prune" begin + # [1]-+-> 2 5 --> 6 + # | + # +-> 3 --> 4 7 + g = testgraph([1:2, 0:-1, 3:3, 0:-1, 4:4, 0:-1, 0:-1], [2, 3, 4, 6]) + st = SyntaxTree(g, 1) + stp = JuliaLowering.prune(st) + @test st ≈ stp + @test length(syntax_graph(stp).edge_ranges) === 4 + @test stp.source == LineNumberNode(1) + @test stp[1].source == LineNumberNode(2) + @test stp[2].source == LineNumberNode(3) + @test stp[2][1].source == LineNumberNode(4) + + # (also checks that the last prune didn't destroy the graph) + # 1 -+-> 2 5 --> 6 + # | + # +-> 3 --> 4 [7] + st = SyntaxTree(g, 7) + stp = JuliaLowering.prune(st) + @test st ≈ stp + @test length(syntax_graph(stp).edge_ranges) === 1 + @test stp.orig == 7 + + # 1 -+->[2]->-> 4 + # | | + # +-> 3 -+ + g = testgraph([1:2, 3:3, 4:4, 0:-1], [2, 3, 4, 4]) + st = SyntaxTree(g, 2) + stp = JuliaLowering.prune(st) + @test st ≈ stp + @test length(syntax_graph(stp).edge_ranges) === 2 + @test stp.orig == 2 + @test stp[1].orig == 4 + + # 9 -->[1]--> 5 src(1) = 2 + # 10 --> 2 --> 6 src(2) = 3 + # 11 --> 3 --> 7 src(3) = 4 + # 12 --> 4 --> 8 else src(i) = line(i) + g = testgraph([1:1, 2:2, 3:3, 4:4, 0:-1, 0:-1, 0:-1, 0:-1, 5:5, 6:6, 7:7, 8:8], + [5, 6, 7, 8, 1, 2, 3, 4], + :source => Dict( + 1=>2, 2=>3, 3=>4, + map(i->(i=>LineNumberNode(i)), 4:12)...)) + st = SyntaxTree(g, 1) + stp = JuliaLowering.prune(st) + @test st ≈ stp + # 1, 5, 4, 8 should remain + @test length(syntax_graph(stp).edge_ranges) === 4 + @test stp.source isa NodeId + orig_4 = SyntaxTree(syntax_graph(stp), stp.source) + @test orig_4.source === LineNumberNode(4) + @test numchildren(orig_4) === 1 + @test orig_4[1].source === LineNumberNode(8) + @test stp[1].source === LineNumberNode(5) + + # Try again with node 3 explicitly marked reachable + stp = JuliaLowering.prune(st, keep=JuliaLowering.SyntaxList(g, NodeId[3, 4])) + @test st ≈ stp + # 1, 5, 4, 8, and now 3, 7 as well + @test length(syntax_graph(stp).edge_ranges) === 6 + @test stp.source isa NodeId + @test stp[1].source === LineNumberNode(5) + + orig_3 = SyntaxTree(syntax_graph(stp), stp.source) + @test orig_3.source isa NodeId + orig_4 = SyntaxTree(syntax_graph(stp), orig_3.source) + @test orig_4.source === LineNumberNode(4) + + @test numchildren(orig_3) === 1 + @test numchildren(orig_4) === 1 + @test orig_3[1].source === LineNumberNode(7) + @test orig_4[1].source === LineNumberNode(8) + + # Try again with no node provenance + stp = JuliaLowering.prune(st, keep=nothing) + @test st ≈ stp + @test length(syntax_graph(stp).edge_ranges) === 2 + @test stp.source === LineNumberNode(4) + @test stp[1].source === LineNumberNode(5) + + # "real world" test with lowered output---not many properties we can + # check without fragile tests, but there are some. + test_mod = Module() + code = "begin; x1=1; x2=2; x3=3; x4=begin; 4; end; begin; end; end" + st0 = parsestmt(SyntaxTree, code) + st5 = JuliaLowering.lower(test_mod, st0) + stp = JuliaLowering.prune(st5) + @test st5 ≈ stp + @test length(syntax_graph(stp).edge_ranges) < length(syntax_graph(st5).edge_ranges) + @test stp.source isa NodeId + @test SyntaxTree(syntax_graph(stp), stp.source) ≈ st0 + @test sourcetext(stp) == code + # try without preserving st0 + stp = JuliaLowering.prune(st5, keep=nothing) + @test st5 ≈ stp + @test length(syntax_graph(stp).edge_ranges) < length(syntax_graph(st5).edge_ranges) + @test stp.source isa SourceRef + @test sourcetext(stp) == code + end end