@@ -763,3 +763,218 @@ end
763763# out
764764# end
765765
766+ # -------------------------------------------------------------------------------
767+ # Data structure utilities
768+
"""
    unalias_nodes(st::SyntaxTree)

Return a tree in which every descendant of `st` has exactly one parent within
`st`. The result is the same tree as `st` in everything but its underlying
representation: each additional parent of a shared subtree receives its own
copy of that subtree. Apart from those copies, `unalias_nodes` should not
allocate new nodes unnecessarily.

    unalias_nodes(sl::SyntaxList)

When given a `SyntaxList`, the resulting trees are unique with respect to each
other as well as internally. An id that occurs more than once in the list
yields a copied tree.
"""
function unalias_nodes(st::SyntaxTree)
    graph = syntax_graph(st)
    # Fresh bookkeeping sets: nothing has been visited before the root.
    root_id = _unalias_nodes!(graph, st._id, Set{NodeId}(), Set{Int}())
    return SyntaxTree(graph, root_id)
end
785+
# `SyntaxList` method: the `visited_nodes`/`visited_edges` sets are shared across
# all entries of the list, so a node reachable from two entries (or an id listed
# twice) is copied the second time it is encountered.
function unalias_nodes(sl::SyntaxList)
    visited_nodes = Set{NodeId}()
    visited_edges = Set{Int}()
    graph = syntax_graph(sl)
    unaliased_ids = map(sl.ids) do root
        _unalias_nodes!(syntax_graph(sl), root, visited_nodes, visited_edges)
    end
    return SyntaxList(graph, unaliased_ids)
end
792+
# Recursive worker for `unalias_nodes`. Returns the id to use in place of `id`:
# either `id` itself (first visit) or the root of a fresh copy (repeat visit).
# `seen` collects visited node ids; `seen_edges` collects visited indices into
# `graph.edges`. Note that `seen_edges` only matters when the edge ranges of
# different nodes overlap, which is a situation we don't produce yet.
function _unalias_nodes!(
    graph::SyntaxGraph, id::NodeId, seen::Set{NodeId}, seen_edges::Set{Int}
)
    if id in seen
        # Every node produced by `copy_ast` is fresh/unreferenced and distinct
        # from the others, so the copy needs no further recursion.
        duplicate = copy_ast(graph, SyntaxTree(graph, id); copy_source=false)
        return duplicate._id
    end

    if any(in(seen_edges), graph.edge_ranges[id])
        # Someone else references our edge slots; move our edges to the end of
        # `graph.edges` so that we can modify them freely.
        first_new = length(graph.edges) + 1
        append!(graph.edges, children(graph, id))
        graph.edge_ranges[id] = first_new:lastindex(graph.edges)
    end
    own_edges = graph.edge_ranges[id]
    union!(seen_edges, own_edges)
    push!(seen, id)

    for (old_child, edge) in zip(children(graph, id), own_edges)
        new_child = _unalias_nodes!(graph, old_child, seen, seen_edges)
        # The replacement child is meant to be equivalent to the old one, so
        # patch the edge in place rather than triggering copies with
        # `mapchildren`.
        if old_child !== new_child
            graph.edges[edge] = new_child
        end
    end
    return id
end
818+
"""
    annotate_parent!(st::SyntaxTree)

Give each descendant of `st` a `parent::NodeId` attribute.

The tree is unaliased first (on a graph with unfrozen attributes), so that every
node has a single parent to record. The root itself is not given a `parent`.
"""
function annotate_parent!(st::SyntaxTree)
    graph = unfreeze_attrs(syntax_graph(st))
    root = unalias_nodes(SyntaxTree(graph, st._id))
    ensure_attributes!(graph; parent=NodeId)
    root_id = root._id
    return mapchildren(syntax_graph(root), root) do child
        _annotate_parent!(child, root_id)
    end
end
828+
# Record `pid` as the parent of `st`, then recurse into the children of `st`
# with `st` as their parent.
function _annotate_parent!(st::SyntaxTree, pid::NodeId)
    setattr!(st; parent=pid)
    own_id = st._id
    return mapchildren(syntax_graph(st), st) do child
        _annotate_parent!(child, own_id)
    end
end
833+
"""
    prune(st::SyntaxTree; keep=flattened_provenance(st))

Return a tree whose graph no longer contains unreachable nodes (nodes that are
not descendants of `st`), and whose provenance data has been minimized.

If `keep` is not `nothing`, its descendants are considered reachable as well.
By default `keep` is the final node(s) in the provenance chain of `st`, which
means we retain expression provenance back to the original parsed nodes but no
lowering-internal provenance. In any case, byte (or, with old macros,
`LineNumberNode`) provenance is still retained.

Provenance shrinkage: green trees are omitted from `SourceRef`s. If node A
references node B as its source and B is unreachable, A adopts the source of B.

No attributes are deleted (that can be done separately). Possible TODO:
non-sparse attributes could be stored more compactly.
"""
function prune(
    st::SyntaxTree;
    keep::Union{SyntaxTree, SyntaxList, Nothing}=flattened_provenance(st),
)
    # `st` always comes first, so the pruned version of it is entry 1 below.
    roots = NodeId[st._id]
    if keep isa SyntaxList
        append!(roots, keep.ids)
    elseif keep isa SyntaxTree
        push!(roots, keep._id)
    end
    pruned = prune_impl(syntax_graph(st), roots)
    return pruned[1]
end
856+
# Pruning variant that preserves aliasing: a node reachable through several
# parents (or listed several times in `entrypoints`) appears exactly once in the
# new graph.
#
# Arguments:
# - `graph1`: the graph to prune.
# - `entrypoints`: ids in `graph1` whose descendants are to be kept.
#
# Returns a `SyntaxList` over the new graph holding the new ids of the
# (de-duplicated) entrypoints, in their original order.
function prune_a(graph1::SyntaxGraph, entrypoints::Vector{NodeId})
    roots = unique(entrypoints)
    # Reachable nodes of graph1 in discovery order; position == id in graph2.
    nodes1 = copy(roots)
    # graph1 => graph2 mapping. Ids are assigned as soon as a node is
    # *discovered*, so an edge to a not-yet-processed child can be written
    # immediately and a shared child is never queued twice.
    map12 = Dict{NodeId, Int}(n1 => n2 for (n2, n1) in enumerate(nodes1))
    graph2 = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)
    while length(graph2.edge_ranges) < length(nodes1)
        n1 = nodes1[length(graph2.edge_ranges) + 1]
        # Parenthesize the range before shifting: `:` binds looser than `.+`.
        push!(graph2.edge_ranges, is_leaf(graph1, n1) ?
            (0:-1) : (1:numchildren(graph1, n1)) .+ length(graph2.edges))
        for c1 in children(graph1, n1)
            c2 = get!(map12, c1) do
                push!(nodes1, c1)
                length(nodes1)
            end
            push!(graph2.edges, c2)
        end
    end

    # Copy every attribute except `source`, which needs resolving (below).
    for attr in attrnames(graph1)
        attr === :source && continue
        for (n2, n1) in enumerate(nodes1)
            attrval = get(graph1.attributes[attr], n1, nothing)
            if !isnothing(attrval)
                graph2.attributes[attr][n2] = attrval
            end
        end
    end

    # Prune provenance. A `.source` may point at a node that was dropped; in
    # that case follow the chain until something that survives is found.
    resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 id => graph2 source
    function get_resolved!(id1::NodeId)
        res = get(resolved_sources, id1, nothing)
        if isnothing(res)
            src1 = graph1.source[id1]
            res = if haskey(map12, src1)
                map12[src1]
            elseif src1 isa NodeId
                get_resolved!(src1)
            elseif src1 isa Tuple
                map(get_resolved!, src1)
            elseif src1 isa SourceRef
                # Rebuild the SourceRef with `nothing` as its last field.
                SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing)
            else
                src1
            end
            resolved_sources[id1] = res
        end
        return res
    end

    for (n2, n1) in enumerate(nodes1)
        graph2.source[n2] = get_resolved!(n1)
    end
    return SyntaxList(graph2, map(id -> map12[id], roots))
end
912+
# Experiment: what can we gain if we are allowed to assume nodes are unaliased?
#
# Unaliasing undoes a small amount of the space savings of the DAG
# representation, but it lets us (1) omit the whole `edges` array (TODO), and
# (2) make the pruning algorithm simpler. The invariant we win is that
# `edge_ranges` is one or more interleaved level-order traversals in which every
# node's set of children is contiguous, so its entries can refer to itself
# instead of to `edges`.
function prune_u(graph1_a::SyntaxGraph, entrypoints_a::Vector{NodeId})
    unaliased = unalias_nodes(SyntaxList(graph1_a, entrypoints_a))
    graph1 = unaliased.graph
    roots = unaliased.ids

    # Reachable subset of graph1, in the order the nodes get their new ids.
    reachable = NodeId[]
    append!(reachable, roots)
    old2new = Dict{NodeId, Int}() # graph1 => graph2 mapping
    graph2 = ensure_attributes!(SyntaxGraph(); attrtypes(graph1)...)

    while length(graph2.edge_ranges) < length(reachable)
        new_id = length(graph2.edge_ranges) + 1
        old_id = reachable[new_id]
        old2new[old_id] = new_id
        # The children are queued right after this, so their new ids are the
        # next `numchildren` positions of `reachable`.
        child_range = if is_leaf(graph1, old_id)
            0:-1
        else
            (1:numchildren(graph1, old_id)) .+ length(reachable)
        end
        push!(graph2.edge_ranges, child_range)
        append!(reachable, children(graph1, old_id))
    end
    graph2.edges = 1:length(reachable) # our reward for unaliasing

    # Copy every attribute except `source`, which is resolved separately below.
    for attr in attrnames(graph1)
        if attr !== :source
            for (new_id, old_id) in enumerate(reachable)
                value = get(graph1.attributes[attr], old_id, nothing)
                isnothing(value) && continue
                graph2.attributes[attr][new_id] = value
            end
        end
    end

    # Prune provenance. Tricky due to dangling `.source` references.
    resolved_sources = Dict{NodeId, SourceAttrType}() # graph1 => graph2
    function get_resolved!(id1::NodeId)
        cached = get(resolved_sources, id1, nothing)
        isnothing(cached) || return cached

        src1 = graph1.source[id1]
        resolved = if haskey(old2new, src1)
            old2new[src1]
        elseif src1 isa NodeId
            # Source node was pruned: adopt whatever it resolves to.
            get_resolved!(src1)
        elseif src1 isa Tuple
            map(get_resolved!, src1)
        elseif src1 isa SourceRef
            SourceRef(src1.file, src1.first_byte, src1.last_byte, nothing)
        else
            src1
        end
        resolved_sources[id1] = resolved
        return resolved
    end

    for (new_id, old_id) in enumerate(reachable)
        graph2.source[new_id] = get_resolved!(old_id)
    end

    # The first n entries of `reachable` were our entrypoints, which are unique
    # thanks to unaliasing.
    return SyntaxList(graph2, 1:length(roots))
end
979+
# Implementation that `prune` calls. Currently the unaliasing variant `prune_u`;
# `prune_a` is the alternative in which a node encountered more than once
# appears once.
980+ const prune_impl = prune_u
0 commit comments