Skip to content

Commit b72226d

Browse files
committed
Graph utils: unalias_nodes, annotate_parent, prune
1 parent 2e508a7 commit b72226d

File tree

2 files changed

+416
-0
lines changed

2 files changed

+416
-0
lines changed

src/syntax_graph.jl

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -799,3 +799,223 @@ end
799799
# out
800800
# end
801801

802+
#-------------------------------------------------------------------------------
803+
# Data structure utilities
804+
"""
    unalias_nodes(st::SyntaxTree)

Return a tree where each descendant of `st` has exactly one parent in `st`. The
returned tree is identical to `st` in all but underlying representation: every
additional parent of a subtree gets its own copy of that subtree. Apart from
those copies, `unalias_nodes` should not allocate new nodes unnecessarily.

The rewrite is performed in place on `syntax_graph(st)`, and the returned tree
refers to that same graph.

    unalias_nodes(sl::SyntaxList)

If a `SyntaxList` is given, every resulting tree will be unique with respect to
each other as well as internally. A duplicate id will produce a copied tree.
"""
function unalias_nodes(st::SyntaxTree)
    graph = syntax_graph(st)
    # Fresh bookkeeping sets: nothing has been visited before this call.
    root_id = _unalias_nodes!(graph, st._id, Set{NodeId}(), Set{Int}())
    return SyntaxTree(graph, root_id)
end
821+
# `SyntaxList` method: the visited-node and visited-edge sets are shared across
# all entries of `sl`, so a node reached from a later entry after it was already
# reached from an earlier one is copied rather than shared.
function unalias_nodes(sl::SyntaxList)
    graph = syntax_graph(sl)
    visited = Set{NodeId}()
    visited_edges = Set{Int}()
    fresh_ids = map(sl.ids) do id
        _unalias_nodes!(graph, id, visited, visited_edges)
    end
    return SyntaxList(graph, fresh_ids)
end
828+
# Recursive worker behind `unalias_nodes`. Returns the id that should stand in
# for `id` in the unaliased result: `id` itself on first visit, or the id of a
# fresh copy when `id` has been visited before. `seen` and `seen_edges` are
# updated in place and are shared by the whole traversal.
#
# Note that `seen_edges` is only needed for when edge ranges overlap, which is a
# situation we don't produce yet.
function _unalias_nodes!(
    graph::SyntaxGraph, id::NodeId, seen::Set{NodeId}, seen_edges::Set{Int}
)
    if id in seen
        # All nodes from `copy_ast` are fresh/unreferenced, and distinct from
        # each other, so no further recursion is needed.
        duplicate = copy_ast(graph, SyntaxTree(graph, id); copy_source=false)
        return duplicate._id
    end

    if !isdisjoint(seen_edges, graph.edge_ranges[id])
        # Someone is referencing our edges; move ours to the end of the edge
        # list so that they can be modified without affecting the other node.
        first_new = length(graph.edges) + 1
        append!(graph.edges, children(graph, id))
        graph.edge_ranges[id] = first_new:lastindex(graph.edges)
    end

    own_edges = graph.edge_ranges[id]
    union!(seen_edges, own_edges)
    push!(seen, id)

    for (child, slot) in zip(children(graph, id), own_edges)
        replacement = _unalias_nodes!(graph, child, seen, seen_edges)
        # The new child should be the same in every way to the old one, so
        # modify the edge instead of triggering copies with `mapchildren`.
        if child !== replacement
            graph.edges[slot] = replacement
        end
    end
    return id
end
854+
"""
    annotate_parent!(st::SyntaxTree)

Give each descendant of `st` a `parent::NodeId` attribute.

The tree is first run through `unalias_nodes` (on the graph returned by
`unfreeze_attrs`), so that every descendant has a single parent to record. The
root itself is not given a `parent` attribute; only its descendants are.
"""
function annotate_parent!(st::SyntaxTree)
    graph = unfreeze_attrs(syntax_graph(st))
    root = unalias_nodes(SyntaxTree(graph, st._id))
    ensure_attributes!(graph; parent=NodeId)
    root_id = root._id
    return mapchildren(syntax_graph(root), root) do child
        _annotate_parent!(child, root_id)
    end
end
864+
# Record `pid` as the `parent` attribute of `st`, then recurse into the
# children of `st` with `st` as their parent.
function _annotate_parent!(st::SyntaxTree, pid::NodeId)
    setattr!(st; parent=pid)
    own_id = st._id
    return mapchildren(syntax_graph(st), st) do child
        _annotate_parent!(child, own_id)
    end
end
869+
"""
    prune(st::SyntaxTree; keep=flattened_provenance(st))

Return a tree where unreachable nodes (non-descendants of `st`) in its graph
have been deleted, and where provenance data has been minimized.

If `keep` is not `nothing`, descendants of it are also considered reachable. By
default, `keep` is the final node(s) in the provenance chain of `st`. This
means that, by default, we have expression provenance back to the original
parsed nodes, but no lowering-internal provenance. In any case, we still retain
byte (or, with old macros, `LineNumberNode`) provenance.

Provenance shrinkage: green trees are omitted from `SourceRef`s. If node A
references node B as its source and B is unreachable, A adopts the source of B.

No attributes are deleted (but that can be done separately). Possible TODO:
non-sparse attributes could be stored more compactly.
"""
function prune(
    st::SyntaxTree;
    keep::Union{SyntaxTree, SyntaxList, Nothing}=flattened_provenance(st),
)
    # `st` always comes first, so it is the first element of the pruned list.
    entrypoints = NodeId[st._id]
    if keep isa SyntaxList
        append!(entrypoints, keep.ids)
    elseif keep isa SyntaxTree
        push!(entrypoints, keep._id)
    end
    # The implementation requires duplicate-free entry points.
    pruned = prune_impl(syntax_graph(st), unique(entrypoints))
    return pruned[1]
end
892+
# Aliasing-preserving implementation of `prune`: copy every node reachable from
# `entrypoints` in `graph1` into a fresh graph, keeping shared subtrees shared.
#
# Nodes are numbered in the new graph in the order a breadth-first walk first
# discovers them, so entry point `i` becomes node `i`. Returns a `SyntaxList`
# over the new graph holding the entry points in their given order.
# `entrypoints` must not contain duplicates.
function prune_a(graph1::SyntaxGraph, entrypoints::Vector{NodeId})
    @assert length(entrypoints) === length(unique(entrypoints))
    # Work queue and final node order: `nodes1[n2]` is the graph1 id of node
    # `n2` in graph2. Nodes encountered more than once appear once.
    nodes1 = copy(entrypoints)
    # graph1 => graph2 mapping. It is filled in as soon as a node is queued (not
    # when it is later expanded), so that a node reached again while it is
    # still waiting in the queue -- a child shared by two parents, or an entry
    # point that is also a descendant of another entry point -- reuses its
    # slot instead of being queued and emitted a second time.
    map12 = Dict{NodeId, Int}()
    for (n2, n1) in enumerate(nodes1)
        map12[n1] = n2
    end
    graph2 = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)
    while length(graph2.edge_ranges) < length(nodes1)
        n1 = nodes1[length(graph2.edge_ranges) + 1]
        # The children of this node occupy the next block of `graph2.edges`.
        push!(graph2.edge_ranges, is_leaf(graph1, n1) ?
            (0:-1) : (1:numchildren(graph1, n1)) .+ length(graph2.edges))
        for c1 in children(graph1, n1)
            if !haskey(map12, c1)
                push!(nodes1, c1)
                map12[c1] = length(nodes1)
            end
            push!(graph2.edges, map12[c1])
        end
    end

    # Copy every attribute except `source`, which needs the special handling
    # below. Nodes that have no value for an attribute are skipped.
    for attr in attrnames(graph1)
        attr === :source && continue
        for (n2, n1) in enumerate(nodes1)
            attrval = get(graph1.attributes[attr], n1, nothing)
            isnothing(attrval) && continue
            graph2.attributes[attr][n2] = attrval
        end
    end

    # Prune provenance. A `source` may point at a node that was not copied, so
    # such references are followed until something that survives is found.
    resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 => graph2
    function get_resolved!(id1::NodeId)
        cached = get(resolved_sources, id1, nothing)
        isnothing(cached) || return cached
        src1 = graph1.source[id1]
        res = if haskey(map12, src1)
            # The source node is reachable: refer to its new id.
            map12[src1]
        elseif src1 isa NodeId
            # The source node is unreachable: adopt its source instead.
            get_resolved!(src1)
        elseif src1 isa Tuple
            # NOTE(review): each element is replaced by the resolved source
            # *of* that element, even when the element itself is reachable
            # (present in `map12`). Confirm this is intended rather than
            # mapping reachable elements to their graph2 ids.
            map(get_resolved!, src1)
        elseif src1 isa SourceRef
            # Rebuild the reference with `nothing` as its last field (the
            # green tree is omitted from pruned `SourceRef`s).
            SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing)
        else
            src1
        end
        resolved_sources[id1] = res
        return res
    end

    for (n2, n1) in enumerate(nodes1)
        graph2.source[n2] = get_resolved!(n1)
    end
    # The first entries of `nodes1` are the entry points, in order.
    return SyntaxList(graph2, 1:length(entrypoints))
end
952+
# Experiment: What can we gain if we are allowed to assume nodes are unaliased?

# This undoes a small amount of space savings from the DAG representation, but
# it allows us to (1) omit the whole `edges` array (TODO), and (2) make the
# pruning algorithm simpler. The invariant we win is having `edge_ranges` be
# one or more interleaved level-order traversals where every node's set of
# children is contiguous, so its entries can refer to itself instead of `edges`.
#
# Returns a `SyntaxList` over a new graph whose first `length(entrypoints_a)`
# nodes correspond to the (unaliased) entry points, in order. The input graph
# is passed through `unalias_nodes` first. `entrypoints_a` must not contain
# duplicates.
function prune_u(graph1_a::SyntaxGraph, entrypoints_a::Vector{NodeId})
    @assert length(entrypoints_a) === length(unique(entrypoints_a))
    unaliased = unalias_nodes(SyntaxList(graph1_a, entrypoints_a))
    graph1 = unaliased.graph
    entrypoints = unaliased.ids

    # Reachable subset of graph1, in output order: `reachable[new]` is the
    # graph1 id of node `new` in the pruned graph.
    reachable = collect(NodeId, entrypoints)
    old2new = Dict{NodeId, Int}() # graph1 => pruned graph mapping
    pruned = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)
    while length(pruned.edge_ranges) < length(reachable)
        new_id = length(pruned.edge_ranges) + 1
        old_id = reachable[new_id]
        old2new[old_id] = new_id
        # The children are about to be appended to `reachable`, so their new
        # ids are the next contiguous run after its current end.
        child_range = if is_leaf(graph1, old_id)
            0:-1
        else
            (1:numchildren(graph1, old_id)) .+ length(reachable)
        end
        push!(pruned.edge_ranges, child_range)
        append!(reachable, children(graph1, old_id))
    end
    pruned.edges = 1:length(reachable) # our reward for unaliasing

    # Copy every attribute except `source`, which is handled separately below.
    # Nodes that have no value for an attribute are skipped.
    for attr in attrnames(graph1)
        attr === :source && continue
        for (new_id, old_id) in enumerate(reachable)
            value = get(graph1.attributes[attr], old_id, nothing)
            isnothing(value) && continue
            pruned.attributes[attr][new_id] = value
        end
    end

    # Prune provenance. Tricky due to dangling `.source` references.
    resolved = Dict{NodeId, SourceAttrType}() # graph1 id => pruned-graph source
    function resolve_source!(old_id::NodeId)
        cached = get(resolved, old_id, nothing)
        isnothing(cached) || return cached
        src = graph1.source[old_id]
        result = if haskey(old2new, src)
            # The source node is reachable: refer to its new id.
            old2new[src]
        elseif src isa NodeId
            # The source node is unreachable: adopt its source instead.
            resolve_source!(src)
        elseif src isa Tuple
            # NOTE(review): each element is replaced by the resolved source
            # *of* that element, even when the element itself is reachable;
            # confirm this is intended.
            map(resolve_source!, src)
        elseif src isa SourceRef
            # Rebuild the reference with `nothing` as its last field (the
            # green tree is omitted from pruned `SourceRef`s).
            SourceRef(src.file, src.first_byte, src.last_byte, nothing)
        else
            src
        end
        resolved[old_id] = result
        return result
    end

    for (new_id, old_id) in enumerate(reachable)
        pruned.source[new_id] = resolve_source!(old_id)
    end

    # The first n entries in `reachable` were our entrypoints, unique from
    # unaliasing.
    return SyntaxList(pruned, 1:length(entrypoints))
end
1020+
1021+
# Implementation that `prune` dispatches to. Currently the unaliasing variant.
const prune_impl = prune_u

0 commit comments

Comments
 (0)