@@ -89,6 +89,7 @@ STATISTIC(FoundProfiledCalleeMaxDepth,
           "Maximum depth of profiled callees found via tail calls");
 STATISTIC(FoundProfiledCalleeNonUniquelyCount,
           "Number of profiled callees found via multiple tail call chains");
+STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
 
 static cl::opt<std::string> DotFilePathPrefix(
     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -127,14 +128,18 @@ static cl::opt<bool> AllowRecursiveCallsites(
     "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
     cl::desc("Allow cloning of callsites involved in recursive cycles"));
 
+static cl::opt<bool> CloneRecursiveContexts(
+    "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
+    cl::desc("Allow cloning of contexts through recursive cycles"));
+
 // When disabled, try to detect and prevent cloning of recursive contexts.
 // This is only necessary until we support cloning through recursive cycles.
 // Leave on by default for now, as disabling requires a little bit of compile
 // time overhead and doesn't affect correctness, it will just inflate the cold
 // hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
 static cl::opt<bool> AllowRecursiveContexts(
     "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
-    cl::desc("Allow cloning of contexts through recursive cycles"));
+    cl::desc("Allow cloning of contexts having recursive cycles"));
 
 namespace llvm {
 cl::opt<bool> EnableMemProfContextDisambiguation(
@@ -293,37 +298,35 @@ class CallsiteContextGraph {
     // TODO: Should this be a map (from Caller node) for more efficient lookup?
     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
 
-    // Get the list of edges from which we can compute allocation information
-    // such as the context ids and allocation type of this node.
-    const std::vector<std::shared_ptr<ContextEdge>> *
-    getEdgesWithAllocInfo() const {
-      // If node has any callees, compute from those, otherwise compute from
-      // callers (i.e. if this is the leaf allocation node).
-      if (!CalleeEdges.empty())
-        return &CalleeEdges;
+    // Returns true if we need to look at the caller edges for determining the
+    // node context ids and allocation type.
+    bool useCallerEdgesForContextInfo() const {
       // Typically if the callee edges are empty either the caller edges are
       // also empty, or this is an allocation (leaf node). However, if we are
       // allowing recursive callsites and contexts this will be violated for
       // incompletely cloned recursive cycles.
-      assert(CallerEdges.empty() || IsAllocation ||
+      assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
              (AllowRecursiveCallsites && AllowRecursiveContexts));
-      if (!CallerEdges.empty() && IsAllocation)
-        return &CallerEdges;
-      return nullptr;
+      // When cloning for a recursive context, we might be in the midst of
+      // cloning for a recurrence and have moved context ids off of a caller
+      // edge onto the clone, but not yet off of the incoming caller (back)
+      // edge. If we don't look at those we miss the fact that this node
+      // still has context ids of interest.
+      return IsAllocation || CloneRecursiveContexts;
     }
 
     // Compute the context ids for this node from the union of its edge context
     // ids.
     DenseSet<uint32_t> getContextIds() const {
-      DenseSet<uint32_t> ContextIds;
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return {};
       unsigned Count = 0;
-      for (auto &Edge : *Edges)
+      for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
         Count += Edge->getContextIds().size();
+      DenseSet<uint32_t> ContextIds;
       ContextIds.reserve(Count);
-      for (auto &Edge : *Edges)
+      std::vector<std::shared_ptr<ContextEdge>> Empty;
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo() ? CallerEdges : Empty);
+      for (const auto &Edge : Edges)
         ContextIds.insert(Edge->getContextIds().begin(),
                           Edge->getContextIds().end());
       return ContextIds;
@@ -332,13 +335,13 @@ class CallsiteContextGraph {
     // Compute the allocation type for this node from the OR of its edge
     // allocation types.
     uint8_t computeAllocType() const {
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return (uint8_t)AllocationType::None;
       uint8_t BothTypes =
           (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
       uint8_t AllocType = (uint8_t)AllocationType::None;
-      for (auto &Edge : *Edges) {
+      std::vector<std::shared_ptr<ContextEdge>> Empty;
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo() ? CallerEdges : Empty);
+      for (const auto &Edge : Edges) {
         AllocType |= Edge->AllocTypes;
         // Bail early if alloc type reached both, no further refinement.
         if (AllocType == BothTypes)
@@ -350,10 +353,10 @@ class CallsiteContextGraph {
     // The context ids set for this node is empty if its edge context ids are
     // also all empty.
     bool emptyContextIds() const {
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return true;
-      for (auto &Edge : *Edges) {
+      std::vector<std::shared_ptr<ContextEdge>> Empty;
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo() ? CallerEdges : Empty);
+      for (const auto &Edge : Edges) {
         if (!Edge->getContextIds().empty())
           return false;
       }
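The three helpers above (getContextIds, computeAllocType, emptyContextIds) share one pattern: iterate the callee edges and, when useCallerEdgesForContextInfo() says so, the caller edges as well, in a single pass via llvm::concat from llvm/ADT/STLExtras.h. Below is a minimal standalone sketch of that pattern; the Edge struct and the orAllocTypes function are illustrative stand-ins, not code from this patch.

// Sketch only: Edge and orAllocTypes are made-up names; llvm::concat is the
// real API used by the patch above.
#include "llvm/ADT/STLExtras.h"
#include <memory>
#include <vector>

struct Edge {
  unsigned AllocTypes = 0;
};

// OR the AllocTypes over the callee edges, plus the caller edges when
// requested, without materializing a combined container.
unsigned orAllocTypes(const std::vector<std::shared_ptr<Edge>> &CalleeEdges,
                      const std::vector<std::shared_ptr<Edge>> &CallerEdges,
                      bool UseCallerEdges) {
  // An empty vector stands in for the caller edges when they are excluded,
  // mirroring the "? CallerEdges : Empty" ternary in the patch.
  std::vector<std::shared_ptr<Edge>> Empty;
  unsigned Result = 0;
  for (const auto &E : llvm::concat<const std::shared_ptr<Edge>>(
           CalleeEdges, UseCallerEdges ? CallerEdges : Empty))
    Result |= E->AllocTypes;
  return Result;
}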
@@ -434,6 +437,14 @@ class CallsiteContextGraph {
     // for contexts including this edge.
     uint8_t AllocTypes = 0;
 
+    // Set just before initiating cloning when cloning of recursive contexts is
+    // enabled. Used to defer cloning of backedges until we have done cloning of
+    // the callee node for non-backedge caller edges. This exposes cloning
+    // opportunities through the backedge of the cycle.
+    // TODO: Note that this is not updated during cloning, and it is unclear
+    // whether that would be needed.
+    bool IsBackedge = false;
+
     // The set of IDs for contexts including this edge.
     DenseSet<uint32_t> ContextIds;
 
@@ -722,6 +733,9 @@ class CallsiteContextGraph {
   void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                  ContextNode *NewCaller);
 
+  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+                     DenseSet<const ContextNode *> &CurrentStack);
+
   /// Recursively perform cloning on the graph for the given Node and its
   /// callers, in order to uniquely identify the allocation behavior of an
   /// allocation given its context. The context ids of the allocation being
@@ -2874,6 +2888,7 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
     raw_ostream &OS) const {
   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
+     << (IsBackedge ? " (BE)" : "")
      << " AllocTypes: " << getAllocTypeString(AllocTypes);
   OS << " ContextIds:";
   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
@@ -3115,6 +3130,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // node (Edge's current callee may be the original node too).
   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
 
+  bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
+
   ContextNode *OldCallee = Edge->Callee;
 
   // We might already have an edge to the new callee from earlier cloning for a
@@ -3181,8 +3198,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     // If this is a direct recursion edge, use NewCallee (the clone) as the
     // callee as well, so that any edge updated/created here is also direct
     // recursive.
-    if (CalleeToUse == OldCallee)
+    if (CalleeToUse == OldCallee) {
+      // If this is a recursive edge, see if we already moved a recursive edge
+      // (which would have to have been this one) - if we were only moving a
+      // subset of context ids it would still be on OldCallee.
+      if (EdgeIsRecursive) {
+        assert(OldCalleeEdge == Edge);
+        continue;
+      }
       CalleeToUse = NewCallee;
+    }
     // The context ids moving to the new callee are the subset of this edge's
     // context ids and the context ids on the caller edge being moved.
     DenseSet<uint32_t> EdgeContextIdsToMove =
@@ -3369,9 +3394,47 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   }
 }
 
+// This is the standard DFS based backedge discovery algorithm.
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
+    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+    DenseSet<const ContextNode *> &CurrentStack) {
+  auto I = Visited.insert(Node);
+  // We should only call this for unvisited nodes.
+  assert(I.second);
+  for (auto &CalleeEdge : Node->CalleeEdges) {
+    auto *Callee = CalleeEdge->Callee;
+    if (Visited.count(Callee)) {
+      // Since this was already visited we need to check if it is currently on
+      // the recursive stack in which case it is a backedge.
+      if (CurrentStack.count(Callee))
+        CalleeEdge->IsBackedge = true;
+      continue;
+    }
+    CurrentStack.insert(Callee);
+    markBackedges(Callee, Visited, CurrentStack);
+    CurrentStack.erase(Callee);
+  }
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
+  // If we are cloning recursive contexts, find and mark backedges from all root
+  // callers, using the typical DFS based backedge analysis.
   DenseSet<const ContextNode *> Visited;
+  if (CloneRecursiveContexts) {
+    DenseSet<const ContextNode *> CurrentStack;
+    for (auto &Entry : NonAllocationCallToContextNodeMap) {
+      auto *Node = Entry.second;
+      if (Node->isRemoved())
+        continue;
+      // It is a root if it doesn't have callers.
+      if (!Node->CallerEdges.empty())
+        continue;
+      markBackedges(Node, Visited, CurrentStack);
+      assert(CurrentStack.empty());
+    }
+  }
   for (auto &Entry : AllocationCallToContextNodeMap) {
     Visited.clear();
     identifyClones(Entry.second, Visited, Entry.second->getContextIds());
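For reference, markBackedges above is the textbook DFS backedge test: an edge is a backedge exactly when its callee is already on the active DFS stack. A self-contained sketch of the same idea on a plain graph follows; the Node and Edge types below are hypothetical stand-ins for ContextNode/ContextEdge, not code from this patch.

// Sketch only: hypothetical Node/Edge types; the pass does this over
// ContextNode/ContextEdge using DenseSet rather than std::unordered_set.
#include <unordered_set>
#include <vector>

struct Node;

struct Edge {
  Node *Callee = nullptr;
  bool IsBackedge = false;
};

struct Node {
  std::vector<Edge> CalleeEdges;
};

// Depth-first search that flags an edge as a backedge when its target is
// already on the current DFS stack (i.e. we are inside a cycle through it).
// Call this once per unvisited root with an empty CurrentStack, as the
// identifyClones() loop above does.
void markBackedges(Node *N, std::unordered_set<const Node *> &Visited,
                   std::unordered_set<const Node *> &CurrentStack) {
  Visited.insert(N);
  for (Edge &E : N->CalleeEdges) {
    Node *Callee = E.Callee;
    if (Visited.count(Callee)) {
      if (CurrentStack.count(Callee))
        E.IsBackedge = true;
      continue;
    }
    CurrentStack.insert(Callee);
    markBackedges(Callee, Visited, CurrentStack);
    CurrentStack.erase(Callee);
  }
}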
@@ -3430,6 +3493,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
       assert(!is_contained(Node->CallerEdges, Edge));
       continue;
     }
+    // Defer backedges. See comments further below where these edges are
+    // handled during the cloning of this Node.
+    if (Edge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      continue;
+    }
     // Ignore any caller we previously visited via another edge.
     if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
       identifyClones(Edge->Caller, Visited, AllocContextIds);
@@ -3483,6 +3554,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
 
   DenseSet<uint32_t> RecursiveContextIds;
+  assert(AllowRecursiveContexts || !CloneRecursiveContexts);
   // If we are allowing recursive callsites, but have also disabled recursive
   // contexts, look for context ids that show up in multiple caller edges.
   if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
@@ -3505,6 +3577,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   // makes it less error-prone.
   auto CallerEdges = Node->CallerEdges;
   for (auto &CallerEdge : CallerEdges) {
+    // Skip any that have been removed by an earlier recursive call.
+    if (CallerEdge->isRemoved()) {
+      assert(!is_contained(Node->CallerEdges, CallerEdge));
+      continue;
+    }
+    assert(CallerEdge->Callee == Node);
+
     // See if cloning the prior caller edge left this node with a single alloc
     // type or a single caller. In that case no more cloning of Node is needed.
     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
@@ -3546,13 +3625,99 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
     //
     // Then check if by cloning node at least one of the callee edges will be
     // disambiguated by splitting out different context ids.
+    //
+    // However, always do the cloning if this is a backedge, in which case we
+    // have not yet cloned along this caller edge.
     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
-    if (allocTypeToUse(CallerAllocTypeForAlloc) ==
+    if (!CallerEdge->IsBackedge &&
+        allocTypeToUse(CallerAllocTypeForAlloc) ==
             allocTypeToUse(Node->AllocTypes) &&
         allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
-            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges))
+            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
       continue;
+    }
+
+    if (CallerEdge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      DeferredBackedges++;
+    }
+
+    // If this is a backedge, we now do recursive cloning starting from its
+    // caller since we may have moved unambiguous caller contexts to a clone
+    // of this Node in a previous iteration of the current loop, giving more
+    // opportunity for cloning through the backedge. Because we sorted the
+    // caller edges earlier so that cold caller edges are first, we would have
+    // visited and cloned this node for any unambiguously cold non-recursive
+    // callers before any ambiguous backedge callers. Note that we don't do
+    // this if the caller is already cloned or visited during cloning (e.g. via
+    // a different context path from the allocation).
+    // TODO: Can we do better in the case where the caller was already visited?
+    if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
+        !Visited.count(CallerEdge->Caller)) {
+      auto OrigIdCount = CallerEdge->getContextIds().size();
+      // Now do the recursive cloning of this backedge's caller, which was
+      // deferred earlier.
+      identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
+      removeNoneTypeCalleeEdges(CallerEdge->Caller);
+      // See if the recursive call to identifyClones moved the context ids to a
+      // new edge from this node to a clone of caller, and switch to looking at
+      // that new edge so that we clone Node for the new caller clone.
+      bool UpdatedEdge = false;
+      if (OrigIdCount > CallerEdge->getContextIds().size()) {
+        for (auto E : Node->CallerEdges) {
+          // Only interested in clones of the current edge's caller.
+          if (E->Caller->CloneOf != CallerEdge->Caller)
+            continue;
+          // See if this edge contains any of the context ids originally on the
+          // current caller edge.
+          auto CallerEdgeContextsForAllocNew =
+              set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
+          if (CallerEdgeContextsForAllocNew.empty())
+            continue;
+          // Make sure we don't pick a previously existing caller edge of this
+          // Node, which would be processed on a different iteration of the
+          // outer loop over the saved CallerEdges.
+          if (std::find(CallerEdges.begin(), CallerEdges.end(), E) !=
+              CallerEdges.end())
+            continue;
+          // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
+          // are updated further below for all cases where we just invoked
+          // identifyClones recursively.
+          CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
+          CallerEdge = E;
+          UpdatedEdge = true;
+          break;
+        }
+      }
+      // If cloning removed this edge (and we didn't update it to a new edge
+      // above), we're done with this edge. It's possible we moved all of the
+      // context ids to an existing clone, in which case there's no need to do
+      // further processing for them.
+      if (CallerEdge->isRemoved())
+        continue;
+
+      // Now we need to update the information used for the cloning decisions
+      // further below, as we may have modified edges and their context ids.
+
+      // Note if we changed the CallerEdge above we would have already updated
+      // the context ids.
+      if (!UpdatedEdge) {
+        CallerEdgeContextsForAlloc = set_intersection(
+            CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
+        if (CallerEdgeContextsForAlloc.empty())
+          continue;
+      }
+      // Update the other information that depends on the edges and on the now
+      // updated CallerEdgeContextsForAlloc.
+      CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
+      CalleeEdgeAllocTypesForCallerEdge.clear();
+      for (auto &CalleeEdge : Node->CalleeEdges)
+        CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
+            CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
+    }
 
     // First see if we can use an existing clone. Check each clone and its
     // callee edges for matching alloc types.