@@ -89,6 +89,7 @@ STATISTIC(FoundProfiledCalleeMaxDepth,
8989 " Maximum depth of profiled callees found via tail calls" );
9090STATISTIC (FoundProfiledCalleeNonUniquelyCount,
9191 " Number of profiled callees found via multiple tail call chains" );
92+ STATISTIC (DeferredBackedges, " Number of backedges with deferred cloning" );
9293
9394static cl::opt<std::string> DotFilePathPrefix (
9495 " memprof-dot-file-path-prefix" , cl::init(" " ), cl::Hidden,
@@ -127,14 +128,18 @@ static cl::opt<bool> AllowRecursiveCallsites(
127128 " memprof-allow-recursive-callsites" , cl::init(true ), cl::Hidden,
128129 cl::desc(" Allow cloning of callsites involved in recursive cycles" ));
129130
131+ static cl::opt<bool > CloneRecursiveContexts (
132+ " memprof-clone-recursive-contexts" , cl::init(true ), cl::Hidden,
133+ cl::desc(" Allow cloning of contexts through recursive cycles" ));
134+
130135// When disabled, try to detect and prevent cloning of recursive contexts.
131136// This is only necessary until we support cloning through recursive cycles.
132137// Leave on by default for now, as disabling requires a little bit of compile
133138// time overhead and doesn't affect correctness, it will just inflate the cold
134139// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
135140static cl::opt<bool > AllowRecursiveContexts (
136141 " memprof-allow-recursive-contexts" , cl::init(true ), cl::Hidden,
137- cl::desc(" Allow cloning of contexts through recursive cycles" ));
142+ cl::desc(" Allow cloning of contexts having recursive cycles" ));
138143
139144namespace llvm {
140145cl::opt<bool > EnableMemProfContextDisambiguation (
@@ -293,37 +298,40 @@ class CallsiteContextGraph {
     // TODO: Should this be a map (from Caller node) for more efficient lookup?
     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
 
-    // Get the list of edges from which we can compute allocation information
-    // such as the context ids and allocation type of this node.
-    const std::vector<std::shared_ptr<ContextEdge>> *
-    getEdgesWithAllocInfo() const {
-      // If node has any callees, compute from those, otherwise compute from
-      // callers (i.e. if this is the leaf allocation node).
-      if (!CalleeEdges.empty())
-        return &CalleeEdges;
+    // Returns true if we need to look at the caller edges for determining the
+    // node context ids and allocation type.
+    bool useCallerEdgesForContextInfo() const {
       // Typically if the callee edges are empty either the caller edges are
       // also empty, or this is an allocation (leaf node). However, if we are
       // allowing recursive callsites and contexts this will be violated for
       // incompletely cloned recursive cycles.
-      assert(CallerEdges.empty() || IsAllocation ||
+      assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
              (AllowRecursiveCallsites && AllowRecursiveContexts));
-      if (!CallerEdges.empty() && IsAllocation)
-        return &CallerEdges;
-      return nullptr;
+      // When cloning for a recursive context, during cloning we might be in the
+      // midst of cloning for a recurrence and have moved context ids off of a
+      // caller edge onto the clone but not yet off of the incoming caller
+      // (back) edge. If we don't look at those we miss the fact that this node
+      // still has context ids of interest.
+      return IsAllocation || CloneRecursiveContexts;
     }
 
     // Compute the context ids for this node from the union of its edge context
     // ids.
     DenseSet<uint32_t> getContextIds() const {
-      DenseSet<uint32_t> ContextIds;
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return {};
       unsigned Count = 0;
-      for (auto &Edge : *Edges)
+      // Compute the number of ids for reserve below. In general we only need to
+      // look at one set of edges, typically the callee edges, since other than
+      // allocations and in some cases during recursion cloning, all the context
+      // ids on the callers should also flow out via callee edges.
+      for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
         Count += Edge->getContextIds().size();
+      DenseSet<uint32_t> ContextIds;
       ContextIds.reserve(Count);
-      for (auto &Edge : *Edges)
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo()
+                           ? CallerEdges
+                           : std::vector<std::shared_ptr<ContextEdge>>());
+      for (const auto &Edge : Edges)
         ContextIds.insert(Edge->getContextIds().begin(),
                           Edge->getContextIds().end());
       return ContextIds;
@@ -332,13 +340,14 @@ class CallsiteContextGraph {
     // Compute the allocation type for this node from the OR of its edge
     // allocation types.
     uint8_t computeAllocType() const {
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return (uint8_t)AllocationType::None;
       uint8_t BothTypes =
           (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
       uint8_t AllocType = (uint8_t)AllocationType::None;
-      for (auto &Edge : *Edges) {
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo()
+                           ? CallerEdges
+                           : std::vector<std::shared_ptr<ContextEdge>>());
+      for (const auto &Edge : Edges) {
         AllocType |= Edge->AllocTypes;
         // Bail early if alloc type reached both, no further refinement.
         if (AllocType == BothTypes)
@@ -350,10 +359,11 @@ class CallsiteContextGraph {
     // The context ids set for this node is empty if its edge context ids are
     // also all empty.
     bool emptyContextIds() const {
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return true;
-      for (auto &Edge : *Edges) {
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo()
+                           ? CallerEdges
+                           : std::vector<std::shared_ptr<ContextEdge>>());
+      for (const auto &Edge : Edges) {
         if (!Edge->getContextIds().empty())
           return false;
       }
@@ -434,6 +444,14 @@ class CallsiteContextGraph {
     // for contexts including this edge.
     uint8_t AllocTypes = 0;
 
+    // Set just before initiating cloning when cloning of recursive contexts is
+    // enabled. Used to defer cloning of backedges until we have done cloning of
+    // the callee node for non-backedge caller edges. This exposes cloning
+    // opportunities through the backedge of the cycle.
+    // TODO: Note that this is not updated during cloning, and it is unclear
+    // whether that would be needed.
+    bool IsBackedge = false;
+
     // The set of IDs for contexts including this edge.
     DenseSet<uint32_t> ContextIds;
 
@@ -722,6 +740,9 @@ class CallsiteContextGraph {
   void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                  ContextNode *NewCaller);
 
+  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+                     DenseSet<const ContextNode *> &CurrentStack);
+
   /// Recursively perform cloning on the graph for the given Node and its
   /// callers, in order to uniquely identify the allocation behavior of an
   /// allocation given its context. The context ids of the allocation being
@@ -2874,6 +2895,7 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
     raw_ostream &OS) const {
   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
+     << (IsBackedge ? " (BE)" : "")
      << " AllocTypes: " << getAllocTypeString(AllocTypes);
   OS << " ContextIds:";
   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
@@ -3115,6 +3137,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // node (Edge's current callee may be the original node too).
   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
 
+  bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
+
   ContextNode *OldCallee = Edge->Callee;
 
   // We might already have an edge to the new callee from earlier cloning for a
@@ -3181,8 +3205,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     // If this is a direct recursion edge, use NewCallee (the clone) as the
     // callee as well, so that any edge updated/created here is also direct
     // recursive.
-    if (CalleeToUse == OldCallee)
+    if (CalleeToUse == OldCallee) {
+      // If this is a recursive edge, see if we already moved a recursive edge
+      // (which would have to have been this one) - if we were only moving a
+      // subset of context ids it would still be on OldCallee.
+      if (EdgeIsRecursive) {
+        assert(OldCalleeEdge == Edge);
+        continue;
+      }
       CalleeToUse = NewCallee;
+    }
     // The context ids moving to the new callee are the subset of this edge's
     // context ids and the context ids on the caller edge being moved.
     DenseSet<uint32_t> EdgeContextIdsToMove =
@@ -3369,9 +3401,47 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   }
 }
 
+// This is the standard DFS based backedge discovery algorithm.
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
+    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+    DenseSet<const ContextNode *> &CurrentStack) {
+  auto I = Visited.insert(Node);
+  // We should only call this for unvisited nodes.
+  assert(I.second);
+  for (auto &CalleeEdge : Node->CalleeEdges) {
+    auto *Callee = CalleeEdge->Callee;
+    if (Visited.count(Callee)) {
+      // Since this was already visited we need to check if it is currently on
+      // the recursive stack in which case it is a backedge.
+      if (CurrentStack.count(Callee))
+        CalleeEdge->IsBackedge = true;
+      continue;
+    }
+    CurrentStack.insert(Callee);
+    markBackedges(Callee, Visited, CurrentStack);
+    CurrentStack.erase(Callee);
+  }
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
+  // If we are cloning recursive contexts, find and mark backedges from all root
+  // callers, using the typical DFS based backedge analysis.
   DenseSet<const ContextNode *> Visited;
+  if (CloneRecursiveContexts) {
+    DenseSet<const ContextNode *> CurrentStack;
+    for (auto &Entry : NonAllocationCallToContextNodeMap) {
+      auto *Node = Entry.second;
+      if (Node->isRemoved())
+        continue;
+      // It is a root if it doesn't have callers.
+      if (!Node->CallerEdges.empty())
+        continue;
+      markBackedges(Node, Visited, CurrentStack);
+      assert(CurrentStack.empty());
+    }
+  }
   for (auto &Entry : AllocationCallToContextNodeMap) {
     Visited.clear();
     identifyClones(Entry.second, Visited, Entry.second->getContextIds());
@@ -3430,6 +3500,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
       assert(!is_contained(Node->CallerEdges, Edge));
       continue;
     }
+    // Defer backedges. See comments further below where these edges are
+    // handled during the cloning of this Node.
+    if (Edge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      continue;
+    }
     // Ignore any caller we previously visited via another edge.
     if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
       identifyClones(Edge->Caller, Visited, AllocContextIds);
@@ -3483,6 +3561,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
 
   DenseSet<uint32_t> RecursiveContextIds;
+  assert(AllowRecursiveContexts || !CloneRecursiveContexts);
   // If we are allowing recursive callsites, but have also disabled recursive
   // contexts, look for context ids that show up in multiple caller edges.
   if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
@@ -3505,6 +3584,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   // makes it less error-prone.
   auto CallerEdges = Node->CallerEdges;
   for (auto &CallerEdge : CallerEdges) {
+    // Skip any that have been removed by an earlier recursive call.
+    if (CallerEdge->isRemoved()) {
+      assert(!is_contained(Node->CallerEdges, CallerEdge));
+      continue;
+    }
+    assert(CallerEdge->Callee == Node);
+
     // See if cloning the prior caller edge left this node with a single alloc
     // type or a single caller. In that case no more cloning of Node is needed.
     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
@@ -3546,13 +3632,100 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
     //
     // Then check if by cloning node at least one of the callee edges will be
     // disambiguated by splitting out different context ids.
+    //
+    // However, always do the cloning if this is a backedge, in which case we
+    // have not yet cloned along this caller edge.
     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
-    if (allocTypeToUse(CallerAllocTypeForAlloc) ==
+    if (!CallerEdge->IsBackedge &&
+        allocTypeToUse(CallerAllocTypeForAlloc) ==
             allocTypeToUse(Node->AllocTypes) &&
         allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
-            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges))
+            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
       continue;
+    }
+
+    if (CallerEdge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      DeferredBackedges++;
+    }
+
+    // If this is a backedge, we now do recursive cloning starting from its
+    // caller since we may have moved unambiguous caller contexts to a clone
+    // of this Node in a previous iteration of the current loop, giving more
+    // opportunity for cloning through the backedge. Because we sorted the
+    // caller edges earlier so that cold caller edges are first, we would have
+    // visited and cloned this node for any unambiguously cold non-recursive
+    // callers before any ambiguous backedge callers. Note that we don't do this
+    // if the caller is already cloned or visited during cloning (e.g. via a
+    // different context path from the allocation).
+    // TODO: Can we do better in the case where the caller was already visited?
+    if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
+        !Visited.count(CallerEdge->Caller)) {
+      const auto OrigIdCount = CallerEdge->getContextIds().size();
+      // Now do the recursive cloning of this backedge's caller, which was
+      // deferred earlier.
+      identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
+      removeNoneTypeCalleeEdges(CallerEdge->Caller);
+      // See if the recursive call to identifyClones moved the context ids to a
+      // new edge from this node to a clone of caller, and switch to looking at
+      // that new edge so that we clone Node for the new caller clone.
+      bool UpdatedEdge = false;
+      if (OrigIdCount > CallerEdge->getContextIds().size()) {
+        for (auto E : Node->CallerEdges) {
+          // Only interested in clones of the current edge's caller.
+          if (E->Caller->CloneOf != CallerEdge->Caller)
+            continue;
+          // See if this edge contains any of the context ids originally on the
+          // current caller edge.
+          auto CallerEdgeContextsForAllocNew =
+              set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
+          if (CallerEdgeContextsForAllocNew.empty())
+            continue;
+          // Make sure we don't pick a previously existing caller edge of this
+          // Node, which would be processed on a different iteration of the
+          // outer loop over the saved CallerEdges.
+          if (std::find(CallerEdges.begin(), CallerEdges.end(), E) !=
+              CallerEdges.end())
+            continue;
+          // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
+          // are updated further below for all cases where we just invoked
+          // identifyClones recursively.
+          CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
+          CallerEdge = E;
+          UpdatedEdge = true;
+          break;
+        }
+      }
+      // If cloning removed this edge (and we didn't update it to a new edge
+      // above), we're done with this edge. It's possible we moved all of the
+      // context ids to an existing clone, in which case there's no need to do
+      // further processing for them.
+      if (CallerEdge->isRemoved())
+        continue;
+
+      // Now we need to update the information used for the cloning decisions
+      // further below, as we may have modified edges and their context ids.
+
+      // Note if we changed the CallerEdge above we would have already updated
+      // the context ids.
+      if (!UpdatedEdge) {
+        CallerEdgeContextsForAlloc = set_intersection(
+            CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
+        if (CallerEdgeContextsForAlloc.empty())
+          continue;
+      }
+      // Update the other information that depends on the edges and on the now
+      // updated CallerEdgeContextsForAlloc.
+      CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
+      CalleeEdgeAllocTypesForCallerEdge.clear();
+      for (auto &CalleeEdge : Node->CalleeEdges) {
+        CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
+            CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
+      }
+    }
 
     // First see if we can use an existing clone. Check each clone and its
     // callee edges for matching alloc types.
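For readers unfamiliar with the approach, the following standalone sketch (not part of the patch above) illustrates the DFS based backedge marking that the new markBackedges() performs: during a depth-first walk over callee edges, an edge whose target is already on the current DFS stack closes a cycle and is marked as a backedge. The Node and Edge types here are hypothetical simplifications of ContextNode and ContextEdge, and the example graph is invented for illustration.

// Minimal sketch of DFS backedge marking, assuming simplified Node/Edge types.
#include <cassert>
#include <set>
#include <vector>

struct Node;

struct Edge {
  Node *Callee;     // target of this callee edge
  bool IsBackedge;  // set when the edge closes a cycle
};

struct Node {
  std::vector<Edge> CalleeEdges;
};

static void markBackedges(Node *N, std::set<const Node *> &Visited,
                          std::set<const Node *> &CurrentStack) {
  // Only invoked for unvisited nodes, mirroring the assertion in the patch.
  assert(!Visited.count(N));
  Visited.insert(N);
  for (Edge &E : N->CalleeEdges) {
    Node *Callee = E.Callee;
    if (Visited.count(Callee)) {
      // Already visited: it is a backedge only if still on the DFS stack.
      if (CurrentStack.count(Callee))
        E.IsBackedge = true;
      continue;
    }
    CurrentStack.insert(Callee);
    markBackedges(Callee, Visited, CurrentStack);
    CurrentStack.erase(Callee);
  }
}

int main() {
  // Root -> A -> B, with the cycle B -> A and a non-cyclic edge B -> C.
  Node Root, A, B, C;
  Root.CalleeEdges.push_back({&A, false});
  A.CalleeEdges.push_back({&B, false});
  B.CalleeEdges.push_back({&A, false});
  B.CalleeEdges.push_back({&C, false});

  std::set<const Node *> Visited, CurrentStack;
  markBackedges(&Root, Visited, CurrentStack);
  assert(CurrentStack.empty());

  // Only the cycle-closing edge B -> A is marked as a backedge.
  assert(!Root.CalleeEdges[0].IsBackedge);
  assert(!A.CalleeEdges[0].IsBackedge);
  assert(B.CalleeEdges[0].IsBackedge);
  assert(!B.CalleeEdges[1].IsBackedge);
  return 0;
}

Marking these edges up front is what lets identifyClones() first clone for the unambiguous, non-backedge callers and only afterwards revisit the deferred cycle-closing edges, as described in the comments added in the last hunk.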