Skip to content

Commit ef1749b

Browse files
committed
Graph utils: unalias_nodes, annotate_parent, prune
1 parent 1bfae9d commit ef1749b

File tree

2 files changed

+416
-0
lines changed

2 files changed

+416
-0
lines changed

src/syntax_graph.jl

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -798,3 +798,223 @@ end
798798
# out
799799
# end
800800

801+
#-------------------------------------------------------------------------------
802+
# Data structure utilities
803+
"""
    unalias_nodes(st::SyntaxTree)

Return a tree in which every descendent of `st` has exactly one parent within
`st`.  The result is the same tree as `st` in every respect except its
underlying representation: whenever a subtree is reachable through more than one
parent, each additional parent receives its own copy of that subtree.  Nodes are
only copied when that is needed to break sharing.

Note that the graph underlying `st` is updated in place (edges are rewritten and
copied nodes are appended); the returned tree lives in that same graph.

    unalias_nodes(sl::SyntaxList)

If a `SyntaxList` is given, the resulting trees share no nodes with each other
and are unaliased internally.  An id that occurs more than once in the list
yields a copied tree for each repeat occurrence.
"""
function unalias_nodes(st::SyntaxTree)
    graph = syntax_graph(st)
    # Fresh bookkeeping sets: a single tree has nothing "already seen".
    root_id = _unalias_nodes!(graph, st._id, Set{NodeId}(), Set{Int}())
    return SyntaxTree(graph, root_id)
end
820+
# `SyntaxList` method: the `seen`/`seen_edges` sets are shared across all list
# entries so that the resulting trees are also disjoint from one another.
function unalias_nodes(sl::SyntaxList)
    graph = syntax_graph(sl)
    visited_nodes = Set{NodeId}()
    visited_edges = Set{Int}()
    new_ids = map(sl.ids) do id
        _unalias_nodes!(graph, id, visited_nodes, visited_edges)
    end
    return SyntaxList(graph, new_ids)
end
827+
# Recursive worker for `unalias_nodes`.
#
# Walks the subtree rooted at `id` depth-first and returns the id that should
# be used in place of `id`:
# - `id` itself the first time it is visited (its edges may be rewritten so
#   that they point at copies of already-visited children), or
# - the root of a fresh copy if `id` has been visited before.
#
# `seen` collects visited node ids and `seen_edges` the indices into
# `graph.edges` owned by visited nodes; both are updated in place, as is
# `graph`.
#
# Note that `seen_edges` is only needed for when edge ranges overlap, which is a
# situation we don't produce yet.
function _unalias_nodes!(graph::SyntaxGraph, id::NodeId, seen::Set{NodeId}, seen_edges::Set{Int})
    if id in seen
        # All nodes from `copy_ast` are fresh/unreferenced, and distinct from
        # each other, so no further recursion is needed.
        return copy_ast(graph, SyntaxTree(graph, id); copy_source=false)._id
    end
    # Short-circuiting overlap test; avoids materialising the intersection as
    # a temporary `Set` for every visited node.
    if any(in(seen_edges), graph.edge_ranges[id])
        # Someone is referencing our edges; move our children to a private
        # range at the end of `graph.edges` so we can modify them.
        next_edge = length(graph.edges) + 1
        append!(graph.edges, children(graph, id))
        graph.edge_ranges[id] = next_edge:lastindex(graph.edges)
    end
    union!(seen_edges, graph.edge_ranges[id])
    push!(seen, id)

    for (c, i) in zip(children(graph, id), graph.edge_ranges[id])
        c2 = _unalias_nodes!(graph, c, seen, seen_edges)
        # The new child should be the same in every way to the old one, so
        # modify the edge instead of triggering copies with `mapchildren`.
        c !== c2 && (graph.edges[i] = c2)
    end
    return id
end
853+
"""
    annotate_parent!(st::SyntaxTree)

Give each descendent of `st` a `parent::NodeId` attribute.

The tree is unaliased first (see `unalias_nodes`) so that every descendent has
exactly one parent to record.  The root is not given a `parent` attribute by
this function; only the nodes below it are annotated.
"""
function annotate_parent!(st::SyntaxTree)
    g = unfreeze_attrs(syntax_graph(st))
    # Bind the unaliased tree to a new name rather than reassigning `st`: a
    # variable that is both reassigned and captured by a closure gets boxed.
    root = unalias_nodes(SyntaxTree(g, st._id))
    ensure_attributes!(g; parent=NodeId)
    root_id = root._id
    return mapchildren(t -> _annotate_parent!(t, root_id), syntax_graph(root), root)
end
863+
# Recursive worker for `annotate_parent!`: record `pid` as the parent of `st`,
# then descend into the children of `st` with `st` as their parent.
function _annotate_parent!(st::SyntaxTree, pid::NodeId)
    setattr!(st; parent=pid)
    own_id = st._id
    return mapchildren(syntax_graph(st), st) do child
        _annotate_parent!(child, own_id)
    end
end
868+
"""
    prune(st::SyntaxTree; keep=flattened_provenance(st))

Return a tree where unreachable nodes (non-descendents of `st`) in its graph
have been deleted, and where provenance data has been minimized.

If `keep` is not `nothing`, descendents of `keep` (a `SyntaxTree` or a
`SyntaxList`) are considered reachable too.  By default, `keep` is the final
node(s) in the provenance chain of `st`.  With that default we retain
expression provenance back to the original parsed nodes, but no
lowering-internal provenance.  In any case, we still retain byte (or, with old
macros, `LineNumberNode`) provenance.

Provenance shrinkage: green trees are omitted from `SourceRef`s.  If node A
references node B as its source and B is unreachable, A adopts the source of B.

No attributes are deleted (but that can be done separately).  Possible TODO:
non-sparse attributes could be stored more compactly.
"""
function prune(st::SyntaxTree; keep::Union{SyntaxTree, SyntaxList, Nothing}=flattened_provenance(st))
    # `st` always comes first so that it is entry 1 of the pruned list.
    roots = NodeId[st._id]
    if keep isa SyntaxTree
        push!(roots, keep._id)
    elseif keep isa SyntaxList
        append!(roots, keep.ids)
    end
    pruned = prune_impl(syntax_graph(st), unique(roots))
    return pruned[1]
end
891+
"""
    prune_a(graph1::SyntaxGraph, entrypoints::Vector{NodeId})

Copy the part of `graph1` that is reachable from `entrypoints` into a fresh
graph, keeping shared nodes shared, and return the entrypoints as a `SyntaxList`
in the new graph (same order; they become nodes `1:length(entrypoints)`).

`entrypoints` must not contain duplicates.  Every attribute other than `source`
is copied as-is for the retained nodes.  `source` is rewritten: references to
retained nodes are renumbered, references to dropped nodes are replaced by the
(recursively resolved) source of the dropped node, and `SourceRef`s are rebuilt
with `nothing` as their last constructor argument.
"""
function prune_a(graph1::SyntaxGraph, entrypoints::Vector{NodeId})
    @assert length(entrypoints) === length(unique(entrypoints))
    # Work queue of graph1 ids in level order; position in the queue is the
    # node's id in graph2.
    nodes1 = copy(entrypoints)
    # graph1 => graph2 mapping.  Ids are assigned when a node is *enqueued*,
    # not when it is dequeued, so a node reached again while it is still
    # waiting in the queue (a shared or repeated child, or an entrypoint that
    # is also a descendent of another entrypoint) is not copied a second time.
    map12 = Dict{NodeId, Int}(n1 => n2 for (n2, n1) in enumerate(nodes1))
    graph2 = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)
    while length(graph2.edge_ranges) < length(nodes1)
        n1 = nodes1[length(graph2.edge_ranges) + 1]
        push!(graph2.edge_ranges, is_leaf(graph1, n1) ?
            (0:-1) : (1:numchildren(graph1, n1)) .+ length(graph2.edges))
        for c1 in children(graph1, n1)
            c2 = get!(map12, c1) do
                push!(nodes1, c1)
                length(nodes1)
            end
            push!(graph2.edges, c2)
        end
    end

    # Copy all non-provenance attributes that are set on retained nodes.
    for attr in attrnames(graph1)
        attr === :source && continue
        for (n2, n1) in enumerate(nodes1)
            attrval = get(graph1.attributes[attr], n1, nothing)
            if !isnothing(attrval)
                graph2.attributes[attr][n2] = attrval
            end
        end
    end

    # Prune provenance.  `get_resolved!(id1)` computes the `source` value that
    # the graph2 copy of graph1 node `id1` should carry, memoised per node.
    resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 => graph2
    function get_resolved!(id1::NodeId)
        res = get(resolved_sources, id1, nothing)
        if isnothing(res)
            src1 = graph1.source[id1]
            res = if haskey(map12, src1)
                # Source node was retained: point at its new id.
                map12[src1]
            elseif src1 isa NodeId
                # Source node was dropped: adopt that node's source instead.
                get_resolved!(src1)
            elseif src1 isa Tuple
                # NOTE(review): each element is replaced by its own resolved
                # source, even when the element itself was retained; confirm
                # that this is the intended behaviour.
                map(get_resolved!, src1)
            elseif src1 isa SourceRef
                SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing)
            else
                src1
            end
            resolved_sources[id1] = res
        end
        return res
    end

    for (n2, n1) in enumerate(nodes1)
        graph2.source[n2] = get_resolved!(n1)
    end
    # The first entries of `nodes1` are the entrypoints, in their given order.
    return SyntaxList(graph2, 1:length(entrypoints))
end
951+
# Experiment: what can we gain if we are allowed to assume nodes are unaliased?
#
# Unaliasing gives back a little of the space the DAG representation saves, but
# in exchange we can (1) omit the whole `edges` array (TODO), and (2) use a
# simpler pruning algorithm.  The invariant we gain is that `edge_ranges` forms
# one or more interleaved level-order traversals in which the children of every
# node are contiguous, so its entries can index the node list directly rather
# than going through `edges`.
function prune_u(graph1_a::SyntaxGraph, entrypoints_a::Vector{NodeId})
    @assert length(entrypoints_a) === length(unique(entrypoints_a))
    unaliased = unalias_nodes(SyntaxList(graph1_a, entrypoints_a))
    graph1 = unaliased.graph
    entrypoints = unaliased.ids

    # Reachable subset of graph1, in the order the nodes will have in graph2.
    nodes1 = NodeId[]
    append!(nodes1, entrypoints)
    map12 = Dict{NodeId, Int}() # graph1 => graph2 mapping
    graph2 = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)

    # Level-order walk: the next node to process is the one just past the
    # edge ranges emitted so far.
    while (n2 = length(graph2.edge_ranges) + 1) <= length(nodes1)
        n1 = nodes1[n2]
        map12[n1] = n2
        # Children are about to be appended to `nodes1`, so their new ids
        # start right after its current end.
        child_range = if is_leaf(graph1, n1)
            0:-1
        else
            (1:numchildren(graph1, n1)) .+ length(nodes1)
        end
        push!(graph2.edge_ranges, child_range)
        append!(nodes1, children(graph1, n1))
    end
    graph2.edges = 1:length(nodes1) # our reward for unaliasing

    # Carry over every attribute except provenance for the retained nodes.
    for attr in attrnames(graph1)
        if attr !== :source
            for (n2, n1) in enumerate(nodes1)
                attrval = get(graph1.attributes[attr], n1, nothing)
                isnothing(attrval) || (graph2.attributes[attr][n2] = attrval)
            end
        end
    end

    # Prune provenance. Tricky due to dangling `.source` references.
    resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 => graph2
    function get_resolved!(id1::NodeId)
        cached = get(resolved_sources, id1, nothing)
        isnothing(cached) || return cached
        src1 = graph1.source[id1]
        resolved = if haskey(map12, src1)
            map12[src1]
        elseif src1 isa NodeId
            get_resolved!(src1)
        elseif src1 isa Tuple
            map(get_resolved!, src1)
        elseif src1 isa SourceRef
            SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing)
        else
            src1
        end
        resolved_sources[id1] = resolved
        return resolved
    end

    for (n2, n1) in enumerate(nodes1)
        graph2.source[n2] = get_resolved!(n1)
    end

    # The first n entries in nodes1 were our entrypoints, unique from unaliasing
    return SyntaxList(graph2, 1:length(entrypoints))
end
1019+
1020+
# Pruning implementation used by `prune`.  `prune_u` unaliases the graph before
# copying it; `prune_a` is the alternative that works on the graph as given.
const prune_impl = prune_u

0 commit comments

Comments
 (0)