@@ -799,3 +799,223 @@ end
799799# out
800800# end
801801
802+ # -------------------------------------------------------------------------------
803+ # Data structure utilities
804+
"""
    unalias_nodes(st::SyntaxTree)

Return a tree in which every descendant of `st` has exactly one parent within
`st`. The result has the same structure and content as `st`; only the underlying
representation differs: whenever a subtree is reachable through more than one
parent, each additional parent receives its own copy of that subtree. Apart from
those copies, `unalias_nodes` should not allocate new nodes unnecessarily.

The work is done in place on the graph of `st`, and the returned tree lives in
that same graph.

    unalias_nodes(sl::SyntaxList)

When a `SyntaxList` is given, the resulting trees are unaliased internally and
also share no nodes with each other. An id that occurs more than once in the
list yields a copied tree for each repeat.
"""
function unalias_nodes(st::SyntaxTree)
    graph = syntax_graph(st)
    # Fresh bookkeeping sets: nothing has been visited before this call.
    root_id = _unalias_nodes!(graph, st._id, Set{NodeId}(), Set{Int}())
    return SyntaxTree(graph, root_id)
end
821+
# `SyntaxList` method: the visited-node and visited-edge sets are shared across
# all list entries, so a node reached from two different entries (or an id
# listed twice) is treated as aliased and gets copied.
function unalias_nodes(sl::SyntaxList)
    graph = syntax_graph(sl)
    visited_nodes = Set{NodeId}()
    visited_edges = Set{Int}()
    unaliased_ids = map(sl.ids) do id
        _unalias_nodes!(graph, id, visited_nodes, visited_edges)
    end
    return SyntaxList(graph, unaliased_ids)
end
828+
# Recursive worker for `unalias_nodes`. Returns the id that should stand in for
# `id` at the position it was reached from: `id` itself on first visit, or the id
# of a fresh copy when `id` has already been claimed by another parent.
#
# `seen` collects the node ids visited so far and `seen_edges` the indices into
# `graph.edges` owned by those nodes. Note that `seen_edges` only matters when
# edge ranges overlap, which is a situation we don't produce yet.
function _unalias_nodes!(
    graph::SyntaxGraph, id::NodeId, seen::Set{NodeId}, seen_edges::Set{Int}
)
    if id in seen
        # Nodes produced by `copy_ast` are fresh, unreferenced and distinct from
        # each other, so the copy needs no further recursion.
        duplicate = copy_ast(graph, SyntaxTree(graph, id); copy_source=false)
        return duplicate._id
    end

    if any(in(seen_edges), graph.edge_ranges[id])
        # Another node already owns some of our edge slots. Move our child list
        # to the end of `graph.edges` so we can rewrite it without affecting them.
        relocated_start = length(graph.edges) + 1
        append!(graph.edges, children(graph, id))
        graph.edge_ranges[id] = relocated_start:lastindex(graph.edges)
    end

    push!(seen, id)
    # Read the range only now: it may have just been relocated above.
    own_range = graph.edge_ranges[id]
    union!(seen_edges, own_range)

    for (child, edge_index) in zip(children(graph, id), own_range)
        replacement = _unalias_nodes!(graph, child, seen, seen_edges)
        # A replacement is identical to the old child in every way except its
        # id, so patch the edge directly instead of triggering copies with
        # `mapchildren`.
        if replacement !== child
            graph.edges[edge_index] = replacement
        end
    end
    return id
end
854+
"""
Give each descendent of `st` a `parent::NodeId` attribute.

The tree is unaliased first (see `unalias_nodes`) so that every node has a
single, well-defined parent. The root `st` itself is not given a `parent`
value here; only nodes below it are visited.
"""
function annotate_parent!(st::SyntaxTree)
    graph = unfreeze_attrs(syntax_graph(st))
    root = unalias_nodes(SyntaxTree(graph, st._id))
    ensure_attributes!(graph; parent=NodeId)
    root_id = root._id
    return mapchildren(syntax_graph(root), root) do child
        _annotate_parent!(child, root_id)
    end
end
864+
# Record `pid` as the parent of `st`, then recurse into the children of `st`
# with `st` as their parent.
function _annotate_parent!(st::SyntaxTree, pid::NodeId)
    setattr!(st; parent=pid)
    own_id = st._id
    return mapchildren(syntax_graph(st), st) do child
        _annotate_parent!(child, own_id)
    end
end
869+
"""
Return a tree whose graph no longer contains unreachable nodes (nodes that are
not descendents of `st`), and whose provenance data has been minimized.

If `keep` is not `nothing`, descendents of `keep` are considered reachable as
well. By default, `keep` is the final node(s) in the provenance chain of `st`.
With that default we retain expression provenance back to the original parsed
nodes, but no lowering-internal provenance. In any case, byte (or, with old
macros, `LineNumberNode`) provenance is still retained.

Provenance shrinkage: green trees are omitted from `SourceRef`s. If node A
references node B as its source and B is unreachable, A adopts the source of B.

No attributes are deleted (but that can be done separately). Possible TODO:
non-sparse attributes could be stored more compactly.
"""
function prune(
    st::SyntaxTree;
    keep::Union{SyntaxTree, SyntaxList, Nothing}=flattened_provenance(st),
)
    # The tree root always comes first, so it is element 1 of the pruned list.
    roots = NodeId[st._id]
    if keep isa SyntaxList
        append!(roots, keep.ids)
    elseif keep isa SyntaxTree
        push!(roots, keep._id)
    end
    pruned = prune_impl(syntax_graph(st), unique(roots))
    return pruned[1]
end
892+
"""
    prune_a(graph1::SyntaxGraph, entrypoints::Vector{NodeId})

Alias-preserving pruning. Build a new graph containing only the nodes of
`graph1` reachable from `entrypoints`, and return a `SyntaxList` over the new
graph whose entries correspond, in order, to `entrypoints`.

Nodes are numbered in the new graph in the order they are first discovered by a
breadth-first walk starting at the entrypoints. A node reached through several
parents (or an entrypoint that is also a descendant of another entrypoint) is
emitted exactly once and shared by all its parents.

All attributes except `source` are copied over unchanged; `source` is rewritten
so that it never refers to a node that was dropped.

`entrypoints` must not contain duplicates.
"""
function prune_a(graph1::SyntaxGraph, entrypoints::Vector{NodeId})
    @assert length(entrypoints) === length(unique(entrypoints))
    nodes1 = copy(entrypoints) # reachable graph1 ids; position == graph2 id
    # graph1 => graph2 mapping. A node is registered as soon as it is queued in
    # `nodes1`, not when it is later processed; otherwise a node encountered a
    # second time before its turn would be queued (and emitted) twice.
    map12 = Dict{NodeId, Int}(n1 => n2 for (n2, n1) in enumerate(nodes1))
    graph2 = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)
    while length(graph2.edge_ranges) < length(nodes1)
        n1 = nodes1[length(graph2.edge_ranges) + 1]
        # The children's edges are pushed right below, so the range starts just
        # past the current end of `graph2.edges`.
        push!(graph2.edge_ranges, is_leaf(graph1, n1) ?
            (0:-1) : (1:numchildren(graph1, n1)) .+ length(graph2.edges))
        for c1 in children(graph1, n1)
            c2 = get!(map12, c1) do
                push!(nodes1, c1)
                length(nodes1)
            end
            push!(graph2.edges, c2)
        end
    end

    # Copy every attribute except `source`, which needs the special handling
    # below.
    for attr in attrnames(graph1)
        attr === :source && continue
        attrs1 = graph1.attributes[attr]
        attrs2 = graph2.attributes[attr]
        for (n2, n1) in enumerate(nodes1)
            attrval = get(attrs1, n1, nothing)
            isnothing(attrval) || (attrs2[n2] = attrval)
        end
    end

    # Prune provenance. A `source` may point at a node that was not kept, so
    # follow the chain until something representable in graph2 is found.
    resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 id => graph2 source
    function get_resolved!(id1::NodeId)
        res = get(resolved_sources, id1, nothing)
        if isnothing(res)
            src1 = graph1.source[id1]
            res = if haskey(map12, src1)
                # Source node was kept: point at its new id.
                map12[src1]
            elseif src1 isa NodeId
                # Source node was dropped: adopt its source instead.
                get_resolved!(src1)
            elseif src1 isa Tuple
                # NOTE(review): each element is replaced by the resolved source
                # *of* that element, even when the element itself was kept;
                # confirm this is intended.
                map(get_resolved!, src1)
            elseif src1 isa SourceRef
                # Same file and byte range, with the last field set to `nothing`.
                SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing)
            else
                src1
            end
            resolved_sources[id1] = res
        end
        return res
    end

    for (n2, n1) in enumerate(nodes1)
        graph2.source[n2] = get_resolved!(n1)
    end
    # The first entries of `nodes1` are the entrypoints, in their given order.
    return SyntaxList(graph2, 1:length(entrypoints))
end
952+
# Experiment: what can we gain if we are allowed to assume nodes are unaliased?
#
# Unaliasing gives back a small amount of the space saved by the DAG
# representation, but it lets us (1) omit the whole `edges` array (TODO), and
# (2) simplify the pruning algorithm. The invariant gained is that
# `edge_ranges` forms one or more interleaved level-order traversals in which
# every node's children are contiguous, so its entries can refer to node
# positions directly instead of going through `edges`.
function prune_u(graph1_a::SyntaxGraph, entrypoints_a::Vector{NodeId})
    @assert length(entrypoints_a) === length(unique(entrypoints_a))
    unaliased = unalias_nodes(SyntaxList(graph1_a, entrypoints_a))
    graph1 = unaliased.graph
    entrypoints = unaliased.ids
    nodes1 = collect(NodeId, entrypoints) # Reachable subset of graph1
    map12 = Dict{NodeId, Int}()           # graph1 => graph2 mapping
    graph2 = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)
    while length(graph2.edge_ranges) < length(nodes1)
        n2 = length(graph2.edge_ranges) + 1
        n1 = nodes1[n2]
        map12[n1] = n2
        # Children are queued immediately below, so they will occupy the
        # positions just past the current end of `nodes1`.
        child_range = if is_leaf(graph1, n1)
            0:-1
        else
            (1:numchildren(graph1, n1)) .+ length(nodes1)
        end
        push!(graph2.edge_ranges, child_range)
        append!(nodes1, children(graph1, n1))
    end
    # Our reward for unaliasing: edge `i` simply points at node `i`.
    # NOTE(review): this relies on `SyntaxGraph` allowing assignment to `edges`;
    # confirm against the type definition.
    graph2.edges = 1:length(nodes1)

    # Copy every attribute except `source`, which is handled separately below.
    for attr in attrnames(graph1)
        attr === :source && continue
        for (n2, n1) in enumerate(nodes1)
            attrval = get(graph1.attributes[attr], n1, nothing)
            if !isnothing(attrval)
                graph2.attributes[attr][n2] = attrval
            end
        end
    end

    # Prune provenance. Tricky due to dangling `.source` references.
    resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 => graph2
    function get_resolved!(id1::NodeId)
        cached = get(resolved_sources, id1, nothing)
        isnothing(cached) || return cached

        src1 = graph1.source[id1]
        resolved = if haskey(map12, src1)
            # Source node was kept: point at its new id.
            map12[src1]
        elseif src1 isa NodeId
            # Source node was dropped: adopt its source instead.
            get_resolved!(src1)
        elseif src1 isa Tuple
            map(get_resolved!, src1)
        elseif src1 isa SourceRef
            # Same file and byte range, with the last field set to `nothing`.
            SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing)
        else
            src1
        end
        resolved_sources[id1] = resolved
        return resolved
    end

    for (n2, n1) in enumerate(nodes1)
        graph2.source[n2] = get_resolved!(n1)
    end

    # The first n entries in nodes1 were our entrypoints, unique from unaliasing
    return SyntaxList(graph2, 1:length(entrypoints))
end
1020+
# Pruning implementation called by `prune`. Currently the variant that
# unaliases its input before pruning (`prune_u`).
1021+ const prune_impl = prune_u
0 commit comments