@@ -761,3 +761,218 @@ end
761761# out
762762# end
763763
764+ # -------------------------------------------------------------------------------
765+ # Data structure utilities
766+
"""
    unalias_nodes(st::SyntaxTree)

Return a tree in which every descendant of `st` has exactly one parent within
`st`. The result matches `st` in everything except its underlying
representation: each additional parent of a shared subtree receives its own
copy of that subtree. Beyond those copies, `unalias_nodes` should not allocate
new nodes unnecessarily. The work is done on the graph of `st` itself rather
than on a copy of it.

    unalias_nodes(sl::SyntaxList)

When a `SyntaxList` is given, the resulting trees are unique with respect to
one another as well as internally. A repeated id yields a copied tree.
"""
function unalias_nodes(st::SyntaxTree)
    graph = syntax_graph(st)
    # Fresh bookkeeping sets: a single tree has no earlier roots to compare against.
    root = _unalias_nodes!(graph, st._id, Set{NodeId}(), Set{Int}())
    return SyntaxTree(graph, root)
end
783+
function unalias_nodes(sl::SyntaxList)
    graph = syntax_graph(sl)
    # The bookkeeping sets are shared across all roots so that trees in the
    # list are unaliased with respect to each other, not just internally.
    visited = Set{NodeId}()
    visited_edges = Set{Int}()
    roots = map(sl.ids) do id
        _unalias_nodes!(graph, id, visited, visited_edges)
    end
    return SyntaxList(graph, roots)
end
790+
# Recursive worker for `unalias_nodes`. Returns the id that should stand in for
# `id`: either `id` itself (first visit) or the root of a fresh copy (repeat
# visit). `seen` collects visited node ids and `seen_edges` collects the indices
# into `graph.edges` that visited nodes own.
#
# Note that `seen_edges` is only needed when edge ranges overlap, which is a
# situation we don't produce yet.
function _unalias_nodes!(graph::SyntaxGraph, id::NodeId,
                         seen::Set{NodeId}, seen_edges::Set{Int})
    if id in seen
        # Every node produced by `copy_ast` is fresh/unreferenced and distinct
        # from all others, so there is nothing further to recurse into.
        duplicate = copy_ast(graph, SyntaxTree(graph, id); copy_source=false)
        return duplicate._id
    end

    shared_edges = intersect(seen_edges, graph.edge_ranges[id])
    if !isempty(shared_edges)
        # Another node already references our edge slots; move our edges to
        # the end of `graph.edges` so they can be rewritten independently.
        first_new_edge = length(graph.edges) + 1
        append!(graph.edges, children(graph, id))
        graph.edge_ranges[id] = first_new_edge:lastindex(graph.edges)
    end
    union!(seen_edges, graph.edge_ranges[id])
    push!(seen, id)

    for (edge_index, child) in zip(graph.edge_ranges[id], children(graph, id))
        replacement = _unalias_nodes!(graph, child, seen, seen_edges)
        # The replacement is equivalent to the old child in every way, so
        # patch the edge directly instead of triggering copies via `mapchildren`.
        if replacement !== child
            graph.edges[edge_index] = replacement
        end
    end
    return id
end
816+
"""
    annotate_parent!(st::SyntaxTree)

Give each descendant of `st` a `parent::NodeId` attribute. The tree is passed
through `unalias_nodes` first; the root itself is not given a `parent` value
here.
"""
function annotate_parent!(st::SyntaxTree)
    graph = unfreeze_attrs(syntax_graph(st))
    root = unalias_nodes(SyntaxTree(graph, st._id))
    ensure_attributes!(graph; parent=NodeId)
    root_id = root._id
    return mapchildren(syntax_graph(root), root) do child
        _annotate_parent!(child, root_id)
    end
end
826+
# Record `pid` as the parent of `st`, then recurse so that every child of `st`
# records `st` as its parent.
function _annotate_parent!(st::SyntaxTree, pid::NodeId)
    setattr!(st; parent=pid)
    own_id = st._id
    return mapchildren(syntax_graph(st), st) do child
        _annotate_parent!(child, own_id)
    end
end
831+
"""
    prune(st::SyntaxTree; keep=flattened_provenance(st))

Return a tree whose graph no longer contains unreachable nodes (those that are
not descendants of `st`), and whose provenance data has been minimized.

When `keep` is not `nothing`, its descendants are treated as reachable too. By
default `keep` is the final node(s) in the provenance chain of `st`, which
means that, by default, expression provenance back to the originally parsed
nodes is retained while lowering-internal provenance is dropped. In every case
byte (or, with old macros, `LineNumberNode`) provenance is still retained.

Provenance shrinkage: green trees are omitted from `SourceRef`s. If node A
names node B as its source and B is unreachable, A adopts the source of B.

No attributes are deleted (that can be done separately). Possible TODO:
non-sparse attributes could be stored more compactly.
"""
function prune(st::SyntaxTree;
               keep::Union{SyntaxTree, SyntaxList, Nothing}=flattened_provenance(st))
    entrypoints = NodeId[st._id]
    if keep isa SyntaxList
        append!(entrypoints, keep.ids)
    elseif keep isa SyntaxTree
        push!(entrypoints, keep._id)
    end
    pruned = prune_impl(syntax_graph(st), entrypoints)
    # `st` was the first entrypoint, so its pruned counterpart is the first result.
    return pruned[1]
end
854+
"""
    prune_a(graph1::SyntaxGraph, entrypoints::Vector{NodeId})

Build a new graph containing only the nodes of `graph1` reachable from
`entrypoints`, preserving aliasing: a node reached through several parents
appears once in the new graph and is referenced by each of them.

Returns a `SyntaxList` over the new graph with one entry per unique entrypoint,
in order of first occurrence. All attributes other than `source` are copied
as-is; `source` is rewritten so that it never refers to a dropped node.
"""
function prune_a(graph1::SyntaxGraph, entrypoints::Vector{NodeId})
    entrypoints = unique(entrypoints)
    nodes1 = NodeId[]            # reachable graph1 nodes, in discovery order
    map12 = Dict{NodeId, Int}()  # graph1 id => graph2 id

    # A node's graph2 id is assigned when it is first discovered, not when it
    # is processed: this is what lets a parent write its edge immediately and
    # guarantees that nodes encountered more than once appear only once.
    function visit!(id1::NodeId)
        return get!(map12, id1) do
            push!(nodes1, id1)
            length(nodes1)
        end
    end
    foreach(visit!, entrypoints)

    graph2 = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)
    while length(graph2.edge_ranges) < length(nodes1)
        n1 = nodes1[length(graph2.edge_ranges) + 1]
        # Parenthesize the range before shifting it: `1:n .+ k` would parse as
        # `1:(n .+ k)` because `:` binds more loosely than `.+`.
        push!(graph2.edge_ranges, is_leaf(graph1, n1) ?
            (0:-1) : (1:numchildren(graph1, n1)) .+ length(graph2.edges))
        for c1 in children(graph1, n1)
            push!(graph2.edges, visit!(c1))
        end
    end

    # Copy every attribute except `source`, which needs rewriting (below).
    for attr in attrnames(graph1)
        attr === :source && continue
        for (n2, n1) in enumerate(nodes1)
            attrval = get(graph1.attributes[attr], n1, nothing)
            if !isnothing(attrval)
                graph2.attributes[attr][n2] = attrval
            end
        end
    end

    # Prune provenance. A `source` may point at a node that was dropped, in
    # which case we follow the chain until something we can keep is found.
    resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 => graph2
    function get_resolved!(id1::NodeId)
        res = get(resolved_sources, id1, nothing)
        if isnothing(res)
            src1 = graph1.source[id1]
            res = if haskey(map12, src1)
                # Source node survived: point at its new id.
                map12[src1]
            elseif src1 isa NodeId
                # Source node was dropped: adopt that node's own source.
                get_resolved!(src1)
            elseif src1 isa Tuple
                # NOTE(review): elements go straight to `get_resolved!`, so an
                # element that is itself reachable is replaced by its resolved
                # source rather than by its new id — confirm this is intended.
                map(get_resolved!, src1)
            elseif src1 isa SourceRef
                # Rebuild the SourceRef with `nothing` as its final argument.
                SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing)
            else
                src1
            end
            resolved_sources[id1] = res
        end
        return res
    end

    for (n2, n1) in enumerate(nodes1)
        graph2.source[n2] = get_resolved!(n1)
    end
    return SyntaxList(graph2, map(id -> map12[id], entrypoints))
end
910+
# Experiment: what can we gain if we are allowed to assume nodes are unaliased?
#
# This gives back a small amount of the space saved by the DAG representation,
# but it lets us (1) omit the whole `edges` array (TODO), and (2) simplify the
# pruning algorithm. The invariant we gain is that `edge_ranges` forms one or
# more interleaved level-order traversals in which every node's set of children
# is contiguous, so its entries can refer to itself instead of to `edges`.
function prune_u(graph1_a::SyntaxGraph, entrypoints_a::Vector{NodeId})
    unaliased = unalias_nodes(SyntaxList(graph1_a, entrypoints_a))
    graph1 = unaliased.graph
    entrypoints = unaliased.ids
    nodes1 = collect(NodeId, entrypoints)  # reachable subset of graph1
    map12 = Dict{NodeId, Int}()            # graph1 => graph2 mapping
    graph2 = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)
    while length(graph2.edge_ranges) < length(nodes1)
        n2 = length(graph2.edge_ranges) + 1
        n1 = nodes1[n2]
        map12[n1] = n2
        # Children are appended to `nodes1` right after this, so their new ids
        # are the next `numchildren` positions past the current end of `nodes1`.
        child_range = if is_leaf(graph1, n1)
            0:-1
        else
            (1:numchildren(graph1, n1)) .+ length(nodes1)
        end
        push!(graph2.edge_ranges, child_range)
        append!(nodes1, children(graph1, n1))
    end
    graph2.edges = 1:length(nodes1) # our reward for unaliasing

    # Copy every attribute except `source`, which is rewritten separately.
    for attr in attrnames(graph1)
        attr === :source && continue
        old_vals = graph1.attributes[attr]
        new_vals = graph2.attributes[attr]
        for (n2, n1) in enumerate(nodes1)
            val = get(old_vals, n1, nothing)
            val === nothing || (new_vals[n2] = val)
        end
    end

    # Prune provenance. Tricky due to dangling `.source` references.
    resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 => graph2
    function get_resolved!(id1::NodeId)
        cached = get(resolved_sources, id1, nothing)
        isnothing(cached) || return cached
        src1 = graph1.source[id1]
        res = if haskey(map12, src1)
            map12[src1]
        elseif src1 isa NodeId
            get_resolved!(src1)
        elseif src1 isa Tuple
            map(get_resolved!, src1)
        elseif src1 isa SourceRef
            SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing)
        else
            src1
        end
        resolved_sources[id1] = res
        return res
    end

    for (n2, n1) in enumerate(nodes1)
        graph2.source[n2] = get_resolved!(n1)
    end

    # The first n entries in nodes1 were our entrypoints, unique from unaliasing
    return SyntaxList(graph2, 1:length(entrypoints))
end
977+
# Implementation that `prune` calls; currently bound to `prune_u`.
const prune_impl = prune_u
0 commit comments