
Commit 5a57d4e

[MemProf] Support cloning through recursive cycles
In order to facilitate cloning of recursive cycles, we first identify backedges using a standard DFS search from the root callers, then initially defer recursively invoking the cloning function via those edges. This is because the cloning opportunity along the backedge may not be exposed until the current node is cloned for other non-backedge callers that are cold after the earlier recursive cloning, resulting in a cold predecessor of the backedge. So we recursively invoke the cloning function for the backedges during the cloning of the current node for its caller edges (which were sorted to enable handling cold callers first). There was no significant time or memory overhead measured for several large applications.
1 parent 26fc2e9 commit 5a57d4e
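For orientation, below is a minimal, self-contained sketch of the DFS backedge marking described above, as applied conceptually to the callsite context graph. The Node/Edge types and the free-standing markBackedges function here are simplified stand-ins for the ContextNode/ContextEdge members added in the diff further down; this is illustrative only, not the patch's actual code.

#include <memory>
#include <set>
#include <vector>

// Simplified stand-ins for the patch's ContextNode/ContextEdge types.
struct Node;
struct Edge {
  Node *Callee = nullptr;
  // Marked once up front; cloning along these edges is deferred until the
  // callee has been cloned for its non-backedge callers.
  bool IsBackedge = false;
};
struct Node {
  std::vector<std::shared_ptr<Edge>> CalleeEdges;
};

// Standard DFS backedge discovery: a callee that is already on the current
// DFS stack closes a cycle, so the edge reaching it is a backedge.
void markBackedges(Node *N, std::set<const Node *> &Visited,
                   std::set<const Node *> &CurrentStack) {
  Visited.insert(N);
  for (auto &E : N->CalleeEdges) {
    Node *Callee = E->Callee;
    if (Visited.count(Callee)) {
      if (CurrentStack.count(Callee))
        E->IsBackedge = true;
      continue;
    }
    CurrentStack.insert(Callee);
    markBackedges(Callee, Visited, CurrentStack);
    CurrentStack.erase(Callee);
  }
}

As in the patch, the walk starts from each root caller (a node with no callers), and the marked backedges are then skipped during the initial recursive cloning and revisited while cloning the node for its sorted caller edges.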

File tree

3 files changed (+224, -45 lines)


llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp

Lines changed: 195 additions & 30 deletions
@@ -89,6 +89,7 @@ STATISTIC(FoundProfiledCalleeMaxDepth,
           "Maximum depth of profiled callees found via tail calls");
 STATISTIC(FoundProfiledCalleeNonUniquelyCount,
           "Number of profiled callees found via multiple tail call chains");
+STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
 
 static cl::opt<std::string> DotFilePathPrefix(
     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -127,14 +128,18 @@ static cl::opt<bool> AllowRecursiveCallsites(
     "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
     cl::desc("Allow cloning of callsites involved in recursive cycles"));
 
+static cl::opt<bool> CloneRecursiveContexts(
+    "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
+    cl::desc("Allow cloning of contexts through recursive cycles"));
+
 // When disabled, try to detect and prevent cloning of recursive contexts.
 // This is only necessary until we support cloning through recursive cycles.
 // Leave on by default for now, as disabling requires a little bit of compile
 // time overhead and doesn't affect correctness, it will just inflate the cold
 // hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
 static cl::opt<bool> AllowRecursiveContexts(
     "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
-    cl::desc("Allow cloning of contexts through recursive cycles"));
+    cl::desc("Allow cloning of contexts having recursive cycles"));
 
 namespace llvm {
 cl::opt<bool> EnableMemProfContextDisambiguation(
@@ -293,37 +298,35 @@ class CallsiteContextGraph {
     // TODO: Should this be a map (from Caller node) for more efficient lookup?
     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
 
-    // Get the list of edges from which we can compute allocation information
-    // such as the context ids and allocation type of this node.
-    const std::vector<std::shared_ptr<ContextEdge>> *
-    getEdgesWithAllocInfo() const {
-      // If node has any callees, compute from those, otherwise compute from
-      // callers (i.e. if this is the leaf allocation node).
-      if (!CalleeEdges.empty())
-        return &CalleeEdges;
+    // Returns true if we need to look at the callee edges for determining the
+    // node context ids and allocation type.
+    bool useCallerEdgesForContextInfo() const {
       // Typically if the callee edges are empty either the caller edges are
       // also empty, or this is an allocation (leaf node). However, if we are
      // allowing recursive callsites and contexts this will be violated for
       // incompletely cloned recursive cycles.
-      assert(CallerEdges.empty() || IsAllocation ||
+      assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
             (AllowRecursiveCallsites && AllowRecursiveContexts));
-      if (!CallerEdges.empty() && IsAllocation)
-        return &CallerEdges;
-      return nullptr;
+      // When cloning for a recursive context, during cloning we might be in the
+      // midst of cloning for a recurrence and have moved context ids off of a
+      // caller edge onto the clone but not yet off of the incoming caller
+      // (back) edge. If we don't look at those we miss the fact that this node
+      // still has context ids of interest.
+      return IsAllocation || CloneRecursiveContexts;
     }
 
     // Compute the context ids for this node from the union of its edge context
     // ids.
     DenseSet<uint32_t> getContextIds() const {
-      DenseSet<uint32_t> ContextIds;
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return {};
       unsigned Count = 0;
-      for (auto &Edge : *Edges)
+      for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
         Count += Edge->getContextIds().size();
+      DenseSet<uint32_t> ContextIds;
       ContextIds.reserve(Count);
-      for (auto &Edge : *Edges)
+      std::vector<std::shared_ptr<ContextEdge>> Empty;
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo() ? CallerEdges : Empty);
+      for (const auto &Edge : Edges)
         ContextIds.insert(Edge->getContextIds().begin(),
                           Edge->getContextIds().end());
       return ContextIds;
@@ -332,13 +335,13 @@ class CallsiteContextGraph {
     // Compute the allocation type for this node from the OR of its edge
     // allocation types.
     uint8_t computeAllocType() const {
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return (uint8_t)AllocationType::None;
       uint8_t BothTypes =
           (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
       uint8_t AllocType = (uint8_t)AllocationType::None;
-      for (auto &Edge : *Edges) {
+      std::vector<std::shared_ptr<ContextEdge>> Empty;
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo() ? CallerEdges : Empty);
+      for (const auto &Edge : Edges) {
         AllocType |= Edge->AllocTypes;
         // Bail early if alloc type reached both, no further refinement.
         if (AllocType == BothTypes)
@@ -350,10 +353,10 @@ class CallsiteContextGraph {
     // The context ids set for this node is empty if its edge context ids are
     // also all empty.
     bool emptyContextIds() const {
-      auto *Edges = getEdgesWithAllocInfo();
-      if (!Edges)
-        return true;
-      for (auto &Edge : *Edges) {
+      std::vector<std::shared_ptr<ContextEdge>> Empty;
+      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
+          CalleeEdges, useCallerEdgesForContextInfo() ? CallerEdges : Empty);
+      for (const auto &Edge : Edges) {
         if (!Edge->getContextIds().empty())
           return false;
       }
@@ -434,6 +437,14 @@ class CallsiteContextGraph {
     // for contexts including this edge.
     uint8_t AllocTypes = 0;
 
+    // Set just before initiating cloning when cloning of recursive contexts is
+    // enabled. Used to defer cloning of backedges until we have done cloning of
+    // the callee node for non-backedge caller edges. This exposes cloning
+    // opportunities through the backedge of the cycle.
+    // TODO: Note that this is not updated during cloning, and it is unclear
+    // whether that would be needed.
+    bool IsBackedge = false;
+
     // The set of IDs for contexts including this edge.
     DenseSet<uint32_t> ContextIds;
 
@@ -722,6 +733,9 @@ class CallsiteContextGraph {
   void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                  ContextNode *NewCaller);
 
+  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+                     DenseSet<const ContextNode *> &CurrentStack);
+
   /// Recursively perform cloning on the graph for the given Node and its
   /// callers, in order to uniquely identify the allocation behavior of an
   /// allocation given its context. The context ids of the allocation being
@@ -2874,6 +2888,7 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
     raw_ostream &OS) const {
   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
+     << (IsBackedge ? " (BE)" : "")
      << " AllocTypes: " << getAllocTypeString(AllocTypes);
   OS << " ContextIds:";
   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
@@ -3115,6 +3130,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // node (Edge's current callee may be the original node too).
   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
 
+  bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
+
   ContextNode *OldCallee = Edge->Callee;
 
   // We might already have an edge to the new callee from earlier cloning for a
@@ -3181,8 +3198,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     // If this is a direct recursion edge, use NewCallee (the clone) as the
     // callee as well, so that any edge updated/created here is also direct
     // recursive.
-    if (CalleeToUse == OldCallee)
+    if (CalleeToUse == OldCallee) {
+      // If this is a recursive edge, see if we already moved a recursive edge
+      // (which would have to have been this one) - if we were only moving a
+      // subset of context ids it would still be on OldCallee.
+      if (EdgeIsRecursive) {
+        assert(OldCalleeEdge == Edge);
+        continue;
+      }
       CalleeToUse = NewCallee;
+    }
     // The context ids moving to the new callee are the subset of this edge's
     // context ids and the context ids on the caller edge being moved.
     DenseSet<uint32_t> EdgeContextIdsToMove =
@@ -3369,9 +3394,47 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   }
 }
 
+// This is the standard DFS based backedge discovery algorithm.
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
+    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+    DenseSet<const ContextNode *> &CurrentStack) {
+  auto I = Visited.insert(Node);
+  // We should only call this for unvisited nodes.
+  assert(I.second);
+  for (auto &CalleeEdge : Node->CalleeEdges) {
+    auto *Callee = CalleeEdge->Callee;
+    if (Visited.count(Callee)) {
+      // Since this was already visited we need to check if it is currently on
+      // the recursive stack in which case it is a backedge.
+      if (CurrentStack.count(Callee))
+        CalleeEdge->IsBackedge = true;
+      continue;
+    }
+    CurrentStack.insert(Callee);
+    markBackedges(Callee, Visited, CurrentStack);
+    CurrentStack.erase(Callee);
+  }
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
+  // If we are cloning recursive contexts, find and mark backedges from all root
+  // callers, using the typical DFS based backedge analysis.
   DenseSet<const ContextNode *> Visited;
+  if (CloneRecursiveContexts) {
+    DenseSet<const ContextNode *> CurrentStack;
+    for (auto &Entry : NonAllocationCallToContextNodeMap) {
+      auto *Node = Entry.second;
+      if (Node->isRemoved())
+        continue;
+      // It is a root if it doesn't have callers.
+      if (!Node->CallerEdges.empty())
+        continue;
+      markBackedges(Node, Visited, CurrentStack);
+      assert(CurrentStack.empty());
+    }
+  }
   for (auto &Entry : AllocationCallToContextNodeMap) {
     Visited.clear();
     identifyClones(Entry.second, Visited, Entry.second->getContextIds());
@@ -3430,6 +3493,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
       assert(!is_contained(Node->CallerEdges, Edge));
       continue;
     }
+    // Defer backedges. See comments further below where these edges are
+    // handled during the cloning of this Node.
+    if (Edge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      continue;
+    }
     // Ignore any caller we previously visited via another edge.
     if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
       identifyClones(Edge->Caller, Visited, AllocContextIds);
@@ -3483,6 +3554,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
 
   DenseSet<uint32_t> RecursiveContextIds;
+  assert(AllowRecursiveContexts || !CloneRecursiveContexts);
   // If we are allowing recursive callsites, but have also disabled recursive
   // contexts, look for context ids that show up in multiple caller edges.
   if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
@@ -3505,6 +3577,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   // makes it less error-prone.
   auto CallerEdges = Node->CallerEdges;
   for (auto &CallerEdge : CallerEdges) {
+    // Skip any that have been removed by an earlier recursive call.
+    if (CallerEdge->isRemoved()) {
+      assert(!is_contained(Node->CallerEdges, CallerEdge));
+      continue;
+    }
+    assert(CallerEdge->Callee == Node);
+
     // See if cloning the prior caller edge left this node with a single alloc
     // type or a single caller. In that case no more cloning of Node is needed.
     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
@@ -3546,13 +3625,99 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
     //
     // Then check if by cloning node at least one of the callee edges will be
     // disambiguated by splitting out different context ids.
+    //
+    // However, always do the cloning if this is a backedge, in which case we
+    // have not yet cloned along this caller edge.
     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
-    if (allocTypeToUse(CallerAllocTypeForAlloc) ==
+    if (!CallerEdge->IsBackedge &&
+        allocTypeToUse(CallerAllocTypeForAlloc) ==
            allocTypeToUse(Node->AllocTypes) &&
        allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
-            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges))
+            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
      continue;
+    }
+
+    if (CallerEdge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      DeferredBackedges++;
+    }
+
+    // If this is a backedge, we now do recursive cloning starting from its
+    // caller since we may have moved unambiguous caller contexts to a clone
+    // of this Node in a previous iteration of the current loop, giving more
+    // opportunity for cloning through the backedge. Because we sorted the
+    // caller edges earlier so that cold caller edges are first, we would have
+    // visited and cloned this node for any unambiguously cold non-recursive
+    // callers before any ambiguous backedge callers. Note that we don't do this
+    // if the caller is already cloned or visited during cloning (e.g. via a
+    // different context path from the allocation).
+    // TODO: Can we do better in the case where the caller was already visited?
+    if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
+        !Visited.count(CallerEdge->Caller)) {
+      auto OrigIdCount = CallerEdge->getContextIds().size();
+      // Now do the recursive cloning of this backedge's caller, which was
+      // deferred earlier.
+      identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
+      removeNoneTypeCalleeEdges(CallerEdge->Caller);
+      // See if the recursive call to identifyClones moved the context ids to a
+      // new edge from this node to a clone of caller, and switch to looking at
+      // that new edge so that we clone Node for the new caller clone.
+      bool UpdatedEdge = false;
+      if (OrigIdCount > CallerEdge->getContextIds().size()) {
+        for (auto E : Node->CallerEdges) {
+          // Only interested in clones of the current edge's caller.
+          if (E->Caller->CloneOf != CallerEdge->Caller)
+            continue;
+          // See if this edge contains any of the context ids originally on the
+          // current caller edge.
+          auto CallerEdgeContextsForAllocNew =
+              set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
+          if (CallerEdgeContextsForAllocNew.empty())
+            continue;
+          // Make sure we don't pick a previously existing caller edge of this
+          // Node, which would be processed on a different iteration of the
+          // outer loop over the saved CallerEdges.
+          if (std::find(CallerEdges.begin(), CallerEdges.end(), E) !=
+              CallerEdges.end())
+            continue;
+          // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
+          // are updated further below for all cases where we just invoked
+          // identifyClones recursively.
+          CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
+          CallerEdge = E;
+          UpdatedEdge = true;
+          break;
+        }
+      }
+      // If cloning removed this edge (and we didn't update it to a new edge
+      // above), we're done with this edge. It's possible we moved all of the
+      // context ids to an existing clone, in which case there's no need to do
+      // further processing for them.
+      if (CallerEdge->isRemoved())
+        continue;
+
+      // Now we need to update the information used for the cloning decisions
+      // further below, as we may have modified edges and their context ids.
+
+      // Note if we changed the CallerEdge above we would have already updated
+      // the context ids.
+      if (!UpdatedEdge) {
+        CallerEdgeContextsForAlloc = set_intersection(
+            CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
+        if (CallerEdgeContextsForAlloc.empty())
+          continue;
+      }
+      // Update the other information that depends on the edges and on the now
+      // updated CallerEdgeContextsForAlloc.
+      CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
+      CalleeEdgeAllocTypesForCallerEdge.clear();
+      for (auto &CalleeEdge : Node->CalleeEdges)
+        CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
+            CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
+    }
 
     // First see if we can use an existing clone. Check each clone and its
     // callee edges for matching alloc types.
