@@ -798,3 +798,223 @@ end
798798# out
799799# end
800800
801+ # -------------------------------------------------------------------------------
802+ # Data structure utilities
803+
"""
    unalias_nodes(st::SyntaxTree)

Return a tree in which every descendant of `st` has exactly one parent within
`st`. The result is identical to `st` in everything but its underlying
representation: each additional parent of a shared subtree gets its own copy of
that subtree. Apart from those copies, `unalias_nodes` should not allocate new
nodes unnecessarily.

    unalias_nodes(sl::SyntaxList)

When a `SyntaxList` is given, every resulting tree is unique with respect to the
other trees as well as internally. A duplicate id produces a copied tree.
"""
function unalias_nodes(st::SyntaxTree)
    graph = syntax_graph(st)
    # Fresh bookkeeping sets: a single tree has no earlier roots to collide with.
    visited_nodes = Set{NodeId}()
    visited_edges = Set{Int}()
    root_id = _unalias_nodes!(graph, st._id, visited_nodes, visited_edges)
    return SyntaxTree(graph, root_id)
end
820+
function unalias_nodes(sl::SyntaxList)
    graph = syntax_graph(sl)
    # The bookkeeping sets are shared by all roots of the list, so a node that
    # was already reached from an earlier root is copied when met again.
    visited_nodes = Set{NodeId}()
    visited_edges = Set{Int}()
    new_ids = map(sl.ids) do id
        _unalias_nodes!(graph, id, visited_nodes, visited_edges)
    end
    return SyntaxList(graph, new_ids)
end
827+
# Recursive worker behind `unalias_nodes`. Returns the id that should stand in
# for `id` at the current position: `id` itself on the first visit, or the id of
# a fresh copy on any later visit.
#
# `seen` accumulates the node ids visited so far; `seen_edges` accumulates the
# indices into `graph.edges` that belong to visited nodes. Note that
# `seen_edges` is only needed for when edge ranges overlap, which is a situation
# we don't produce yet.
function _unalias_nodes!(
    graph::SyntaxGraph, id::NodeId, seen::Set{NodeId}, seen_edges::Set{Int}
)
    if id in seen
        # All nodes produced by `copy_ast` are fresh/unreferenced and distinct
        # from each other, so the copy needs no further recursion.
        duplicate = copy_ast(graph, SyntaxTree(graph, id); copy_source=false)
        return duplicate._id
    end

    if !isdisjoint(seen_edges, graph.edge_ranges[id])
        # Some visited node references our edge slots. Move our child list to
        # new slots at the end of `graph.edges` so we can rewrite it freely.
        next_edge = length(graph.edges) + 1
        append!(graph.edges, children(graph, id))
        graph.edge_ranges[id] = next_edge:lastindex(graph.edges)
    end

    own_edges = graph.edge_ranges[id]
    union!(seen_edges, own_edges)
    push!(seen, id)

    for (child, slot) in zip(children(graph, id), own_edges)
        replacement = _unalias_nodes!(graph, child, seen, seen_edges)
        # The replacement is the same as the old child in every way, so patch
        # the edge in place instead of triggering copies with `mapchildren`.
        if replacement !== child
            graph.edges[slot] = replacement
        end
    end
    return id
end
853+
"""
    annotate_parent!(st::SyntaxTree)

Give each descendant of `st` a `parent::NodeId` attribute.

The tree is first passed through `unalias_nodes` on a graph with unfrozen
attributes. The root itself is not assigned a `parent`; only the nodes below it
are.
"""
function annotate_parent!(st::SyntaxTree)
    graph = unfreeze_attrs(syntax_graph(st))
    root = unalias_nodes(SyntaxTree(graph, st._id))
    ensure_attributes!(graph; parent=NodeId)
    return mapchildren(syntax_graph(root), root) do child
        _annotate_parent!(child, root._id)
    end
end
863+
# Record `pid` as the parent of `st`, then recurse into the children of `st`,
# passing the id of `st` as their parent.
function _annotate_parent!(st::SyntaxTree, pid::NodeId)
    setattr!(st; parent=pid)
    own_id = st._id
    return mapchildren(syntax_graph(st), st) do child
        _annotate_parent!(child, own_id)
    end
end
868+
"""
    prune(st::SyntaxTree; keep=flattened_provenance(st))

Return a tree where unreachable nodes (non-descendants of `st`) in its graph
have been deleted, and where provenance data has been minimized.

If `keep` is not `nothing`, its descendants are considered reachable as well. By
default, `keep` is the final node(s) in the provenance chain of `st`. With that
default we keep expression provenance back to the original parsed nodes, but no
lowering-internal provenance. In any case, byte (or, with old macros,
`LineNumberNode`) provenance is retained.

Provenance shrinkage: green trees are omitted from `SourceRef`s. If node A
references node B as its source and B is unreachable, A adopts the source of B.

No attributes are deleted (but that can be done separately). Possible TODO:
non-sparse attributes could be stored more compactly.
"""
function prune(
    st::SyntaxTree;
    keep::Union{SyntaxTree, SyntaxList, Nothing}=flattened_provenance(st),
)
    # `st` always comes first, so it is the first tree of the pruned result.
    entrypoints = NodeId[st._id]
    if keep isa SyntaxList
        append!(entrypoints, keep.ids)
    elseif keep isa SyntaxTree
        push!(entrypoints, keep._id)
    end
    pruned = prune_impl(syntax_graph(st), unique(entrypoints))
    return pruned[1]
end
891+
# Pruning that preserves aliasing: build a fresh graph holding only the nodes
# reachable from `entrypoints` (which must be unique), and return a `SyntaxList`
# over the new graph whose first `length(entrypoints)` ids correspond to
# `entrypoints` in order.
#
# `nodes1` is a worklist of `graph1` ids processed in insertion order; the
# position of an id in `nodes1` is its id in the new graph. `map12` records that
# position as soon as a node is discovered, so a node reached again while it is
# still waiting in the worklist (shared subtrees, or an entrypoint that is also a
# descendant of another entrypoint) is referenced rather than queued twice.
function prune_a(graph1::SyntaxGraph, entrypoints::Vector{NodeId})
    @assert length(entrypoints) === length(unique(entrypoints))
    nodes1 = copy(entrypoints) # note nodes encountered >once appear once
    map12 = Dict{NodeId, Int}(n1 => n2 for (n2, n1) in enumerate(nodes1))
    graph2 = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)
    while length(graph2.edge_ranges) < length(nodes1)
        n1 = nodes1[length(graph2.edge_ranges) + 1]
        child_range = if is_leaf(graph1, n1)
            0:-1
        else
            (1:numchildren(graph1, n1)) .+ length(graph2.edges)
        end
        push!(graph2.edge_ranges, child_range)
        for c1 in children(graph1, n1)
            # Reuse the new id of an already-discovered child; otherwise queue
            # the child and hand out the next free id.
            c2 = get!(map12, c1) do
                push!(nodes1, c1)
                length(nodes1)
            end
            push!(graph2.edges, c2)
        end
    end

    # Copy every attribute except `source`, which needs the resolution below.
    for attr in attrnames(graph1)
        attr === :source && continue
        attrs1 = graph1.attributes[attr]
        for (n2, n1) in enumerate(nodes1)
            attrval = get(attrs1, n1, nothing)
            if !isnothing(attrval)
                graph2.attributes[attr][n2] = attrval
            end
        end
    end

    # Prune provenance: a source that points at a surviving node is remapped, a
    # source that points at a dropped node is replaced by that node's own
    # (resolved) source, and `SourceRef`s are rebuilt with `nothing` as their
    # last field.
    resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 => graph2
    function get_resolved!(id1::NodeId)
        res = get(resolved_sources, id1, nothing)
        if isnothing(res)
            src1 = graph1.source[id1]
            res = if haskey(map12, src1)
                map12[src1]
            elseif src1 isa NodeId
                get_resolved!(src1)
            elseif src1 isa Tuple
                map(get_resolved!, src1)
            elseif src1 isa SourceRef
                SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing)
            else
                src1
            end
            resolved_sources[id1] = res
        end
        return res
    end

    for (n2, n1) in enumerate(nodes1)
        graph2.source[n2] = get_resolved!(n1)
    end
    return SyntaxList(graph2, 1:length(entrypoints))
end
951+
# Experiment: what can we gain if we are allowed to assume nodes are unaliased?
#
# Unaliasing undoes a small amount of the space savings of the DAG
# representation, but it allows us to (1) omit the whole `edges` array (TODO),
# and (2) make the pruning algorithm simpler. The invariant we win is that
# `edge_ranges` is one or more interleaved level-order traversals where every
# node's set of children is contiguous, so its entries can refer to itself
# instead of `edges`.
function prune_u(graph1_a::SyntaxGraph, entrypoints_a::Vector{NodeId})
    @assert length(entrypoints_a) === length(unique(entrypoints_a))
    unaliased = unalias_nodes(SyntaxList(graph1_a, entrypoints_a))
    graph1 = unaliased.graph
    entrypoints = unaliased.ids

    nodes1 = collect(NodeId, entrypoints) # Reachable subset of graph1
    map12 = Dict{NodeId, Int}()           # graph1 => graph2 mapping
    graph2 = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)
    while true
        n2 = length(graph2.edge_ranges) + 1
        n2 > length(nodes1) && break
        n1 = nodes1[n2]
        map12[n1] = n2
        # The children are appended to `nodes1` right below, so they occupy the
        # next `numchildren` positions after the current end of `nodes1`.
        child_range = if is_leaf(graph1, n1)
            0:-1
        else
            (1:numchildren(graph1, n1)) .+ length(nodes1)
        end
        push!(graph2.edge_ranges, child_range)
        append!(nodes1, children(graph1, n1))
    end
    graph2.edges = 1:length(nodes1) # our reward for unaliasing

    # Copy every attribute except `source`, which is resolved separately below.
    for attr in attrnames(graph1)
        attr === :source && continue
        attrs1 = graph1.attributes[attr]
        for (n2, n1) in enumerate(nodes1)
            attrval = get(attrs1, n1, nothing)
            isnothing(attrval) && continue
            graph2.attributes[attr][n2] = attrval
        end
    end

    # Prune provenance. Tricky due to dangling `.source` references.
    resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 => graph2
    function get_resolved!(id1::NodeId)
        cached = get(resolved_sources, id1, nothing)
        isnothing(cached) || return cached
        src1 = graph1.source[id1]
        resolved = if haskey(map12, src1)
            # The source node survives: point at its new id.
            map12[src1]
        elseif src1 isa NodeId
            # The source node was dropped: adopt its own source.
            get_resolved!(src1)
        elseif src1 isa Tuple
            map(get_resolved!, src1)
        elseif src1 isa SourceRef
            SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing)
        else
            src1
        end
        resolved_sources[id1] = resolved
        return resolved
    end

    for n2 in eachindex(nodes1)
        graph2.source[n2] = get_resolved!(nodes1[n2])
    end

    # The first n entries in nodes1 were our entrypoints, unique from unaliasing
    return SyntaxList(graph2, 1:length(entrypoints))
end
1019+
# Pruning implementation called by `prune`; currently `prune_u`, the variant
# that unaliases the entrypoints before copying.
1020+ const prune_impl = prune_u
0 commit comments