diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index b78805b3fa62a..f6764be9b273e 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -124,7 +124,7 @@ static cl::opt
 // Optionally enable cloning of callsites involved with recursive cycles
 static cl::opt<bool> AllowRecursiveCallsites(
-    "memprof-allow-recursive-callsites", cl::init(false), cl::Hidden,
+    "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
     cl::desc("Allow cloning of callsites involved in recursive cycles"));
 
 // When disabled, try to detect and prevent cloning of recursive contexts.
@@ -301,12 +301,14 @@ class CallsiteContextGraph {
     // callers (i.e. if this is the leaf allocation node).
     if (!CalleeEdges.empty())
       return &CalleeEdges;
-    if (!CallerEdges.empty()) {
-      // A node with caller edges but no callee edges must be the allocation
-      // node.
-      assert(IsAllocation);
+    // Typically if the callee edges are empty either the caller edges are
+    // also empty, or this is an allocation (leaf node). However, if we are
+    // allowing recursive callsites and contexts this will be violated for
+    // incompletely cloned recursive cycles.
+    assert(CallerEdges.empty() || IsAllocation ||
+           (AllowRecursiveCallsites && AllowRecursiveContexts));
+    if (!CallerEdges.empty() && IsAllocation)
       return &CallerEdges;
-    }
     return nullptr;
   }
@@ -403,8 +405,13 @@ class CallsiteContextGraph {
     // True if this node was effectively removed from the graph, in which case
     // it should have an allocation type of None and empty context ids.
     bool isRemoved() const {
-      assert((AllocTypes == (uint8_t)AllocationType::None) ==
-             emptyContextIds());
+      // Typically if the callee edges are empty either the caller edges are
+      // also empty, or this is an allocation (leaf node). However, if we are
+      // allowing recursive callsites and contexts this will be violated for
+      // incompletely cloned recursive cycles.
+      assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
+             (AllocTypes == (uint8_t)AllocationType::None) ==
+                 emptyContextIds());
       return AllocTypes == (uint8_t)AllocationType::None;
     }
@@ -1344,16 +1351,48 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
     DenseSet<uint32_t> RemainingContextIds) {
   auto &OrigEdges =
       TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
+  DenseSet<uint32_t> RecursiveContextIds;
+  DenseSet<uint32_t> AllCallerContextIds;
+  if (AllowRecursiveCallsites) {
+    // Identify which context ids are recursive which is needed to properly
+    // update the RemainingContextIds set. The relevant recursive context ids
+    // are those that are in multiple edges.
+    for (auto &CE : OrigEdges) {
+      AllCallerContextIds.reserve(CE->getContextIds().size());
+      for (auto Id : CE->getContextIds())
+        if (!AllCallerContextIds.insert(Id).second)
+          RecursiveContextIds.insert(Id);
+    }
+  }
   // Increment iterator in loop so that we can remove edges as needed.
   for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
     auto Edge = *EI;
+    DenseSet<uint32_t> NewEdgeContextIds;
+    DenseSet<uint32_t> NotFoundContextIds;
     // Remove any matching context ids from Edge, return set that were found
     // and removed, these are the new edge's context ids. Also update the
     // remaining (not found ids).
-    DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
     set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
                  NotFoundContextIds);
-    RemainingContextIds.swap(NotFoundContextIds);
+    // Update the remaining context ids set for the later edges. This is a
+    // compile time optimization.
+    if (RecursiveContextIds.empty()) {
+      // No recursive ids, so all of the previously remaining context ids that
+      // were not seen on this edge are the new remaining set.
+      RemainingContextIds.swap(NotFoundContextIds);
+    } else {
+      // Keep the recursive ids in the remaining set as we expect to see those
+      // on another edge. We can remove the non-recursive remaining ids that
+      // were seen on this edge, however. We already have the set of remaining
+      // ids that were on this edge (in NewEdgeContextIds). Figure out which
+      // are non-recursive and only remove those. Note that despite the higher
+      // overhead of updating the remaining context ids set when recursion
+      // handling is enabled, it was found to be at worst performance neutral
+      // and in one case a clear win.
+      DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
+          set_difference(NewEdgeContextIds, RecursiveContextIds);
+      set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
+    }
     // If no matching context ids for this edge, skip it.
     if (NewEdgeContextIds.empty()) {
       ++EI;
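The RecursiveContextIds computation above hinges on the `insert(...).second` idiom: an id that fails to insert into the running `AllCallerContextIds` set was already contributed by an earlier edge, so it belongs to a context that traverses this node more than once. A minimal standalone sketch of that duplicate-detection idea, using standard containers and illustrative names rather than the pass's types:

```cpp
#include <cstdint>
#include <unordered_set>
#include <vector>

using ContextIdSet = std::unordered_set<uint32_t>;

// Collect the ids that appear in more than one edge's id set. Within a single
// set an id cannot repeat, so any duplicate must span two different edges,
// which is exactly the "recursive at this node" condition used in the patch.
ContextIdSet findRecursiveIds(const std::vector<ContextIdSet> &EdgeIdSets) {
  ContextIdSet Seen;      // every id observed so far, across all edges
  ContextIdSet Recursive; // ids observed on at least two edges
  for (const ContextIdSet &Ids : EdgeIdSets)
    for (uint32_t Id : Ids)
      if (!Seen.insert(Id).second) // failed insert => already seen
        Recursive.insert(Id);
  return Recursive;
}
```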
@@ -1410,9 +1449,9 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
     set_union(CallerEdgeContextIds, Edge->ContextIds);
   }
   // Node can have more context ids than callers if some contexts terminate at
-  // node and some are longer. If we are allowing recursive callsites but
-  // haven't disabled recursive contexts, this will be violated for
-  // incompletely cloned recursive cycles, so skip the checking in that case.
+  // node and some are longer. If we are allowing recursive callsites and
+  // contexts this will be violated for incompletely cloned recursive cycles,
+  // so skip the checking in that case.
   assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
          NodeContextIds == CallerEdgeContextIds ||
          set_is_subset(CallerEdgeContextIds, NodeContextIds));
@@ -1425,7 +1464,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
     checkEdge(Edge);
     set_union(CalleeEdgeContextIds, Edge->getContextIds());
   }
-  assert(NodeContextIds == CalleeEdgeContextIds);
+  // If we are allowing recursive callsites and contexts this will be violated
+  // for incompletely cloned recursive cycles, so skip the checking in that
+  // case.
+  assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
+         NodeContextIds == CalleeEdgeContextIds);
 }
 // FIXME: Since this checking is only invoked under an option, we should
 // change the error checking from using assert to something that will trigger
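The two relaxed assertions in checkNode() guard the same invariant from both directions: absent recursion, a node's context id set must match the union of the ids on its callee edges (and be a superset of the ids on its caller edges). A standalone sketch of the callee-side check under the recursion escape hatch, with toy types rather than the pass's ContextNode:

```cpp
#include <cstdint>
#include <set>
#include <vector>

using IdSet = std::set<uint32_t>;

// Returns true if the invariant holds or is deliberately waived. With
// recursion allowed, an incompletely cloned cycle can leave the node and its
// callee edges temporarily out of sync, so the comparison is skipped.
bool calleeEdgesCoverNode(const IdSet &NodeIds,
                          const std::vector<IdSet> &CalleeEdgeIds,
                          bool AllowRecursion) {
  IdSet Union;
  for (const IdSet &Ids : CalleeEdgeIds)
    Union.insert(Ids.begin(), Ids.end());
  return AllowRecursion || Union == NodeIds;
}
```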
@@ -3134,6 +3177,12 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // over to the corresponding edge into the clone (which is created here if
   // this is a newly created clone).
   for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
+    ContextNode *CalleeToUse = OldCalleeEdge->Callee;
+    // If this is a direct recursion edge, use NewCallee (the clone) as the
+    // callee as well, so that any edge updated/created here is also direct
+    // recursive.
+    if (CalleeToUse == OldCallee)
+      CalleeToUse = NewCallee;
     // The context ids moving to the new callee are the subset of this edge's
     // context ids and the context ids on the caller edge being moved.
     DenseSet<uint32_t> EdgeContextIdsToMove =
@@ -3147,8 +3196,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       // clone, specifically during function assignment, where we would have
       // removed none type edges after creating the clone. If we can't find
       // a corresponding edge there, fall through to the cloning below.
-      if (auto *NewCalleeEdge =
-              NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
+      if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
         NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
                                               EdgeContextIdsToMove.end());
         NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
@@ -3156,8 +3204,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
       }
     }
     auto NewEdge = std::make_shared<ContextEdge>(
-        OldCalleeEdge->Callee, NewCallee,
-        computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
+        CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
+        EdgeContextIdsToMove);
     NewCallee->CalleeEdges.push_back(NewEdge);
     NewEdge->Callee->CallerEdges.push_back(NewEdge);
   }
@@ -3183,13 +3231,20 @@
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                               ContextNode *NewCaller) {
+  auto *OldCallee = Edge->Callee;
+  auto *NewCallee = OldCallee;
+  // If this edge was direct recursive, make any new/updated edge also direct
+  // recursive to NewCaller.
+  bool Recursive = Edge->Caller == Edge->Callee;
+  if (Recursive)
+    NewCallee = NewCaller;
   ContextNode *OldCaller = Edge->Caller;
   OldCaller->eraseCalleeEdge(Edge.get());
 
   // We might already have an edge to the new caller. If one exists we will
   // reuse it.
-  auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(Edge->Callee);
+  auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
 
   if (ExistingEdgeToNewCaller) {
     // Since we already have an edge to NewCaller, simply move the ids
@@ -3199,11 +3254,19 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
     Edge->ContextIds.clear();
     Edge->AllocTypes = (uint8_t)AllocationType::None;
-    Edge->Callee->eraseCallerEdge(Edge.get());
+    OldCallee->eraseCallerEdge(Edge.get());
   } else {
     // Otherwise just reconnect Edge to NewCaller.
     Edge->Caller = NewCaller;
     NewCaller->CalleeEdges.push_back(Edge);
+    if (Recursive) {
+      assert(NewCallee == NewCaller);
+      // In the case of (direct) recursive edges, we update the callee as well
+      // so that it becomes recursive on the new caller.
+      Edge->Callee = NewCallee;
+      NewCallee->CallerEdges.push_back(Edge);
+      OldCallee->eraseCallerEdge(Edge.get());
+    }
     // Don't need to update Edge's context ids since we are simply
     // reconnecting it.
   }
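The else branch above is the subtle part: for a direct recursive edge, reconnecting only the caller end would leave a half-moved self-loop, with the old node still listing the edge among its callers. Both endpoints have to be redirected so the edge stays a self-loop on the new node. A toy sketch of that fix-up (illustrative Node/Edge types, not the pass's ContextNode/ContextEdge; it assumes the edge was already erased from the old node's callee list, as in the code above):

```cpp
#include <algorithm>
#include <memory>
#include <vector>

struct Node;
struct Edge {
  Node *Caller = nullptr;
  Node *Callee = nullptr;
};
struct Node {
  std::vector<std::shared_ptr<Edge>> CalleeEdges, CallerEdges;
  void eraseCallerEdge(Edge *E) {
    CallerEdges.erase(std::remove_if(CallerEdges.begin(), CallerEdges.end(),
                                     [E](auto &P) { return P.get() == E; }),
                      CallerEdges.end());
  }
};

// Move a self-edge (Old -> Old) so it becomes (New -> New).
void moveSelfEdge(const std::shared_ptr<Edge> &E, Node *New) {
  Node *Old = E->Caller; // == E->Callee for a direct recursive edge
  E->Caller = New;
  New->CalleeEdges.push_back(E);
  E->Callee = New; // redirect the callee end too, keeping the edge recursive
  New->CallerEdges.push_back(E);
  Old->eraseCallerEdge(E.get());
}
```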
@@ -3217,32 +3280,50 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
 #ifndef NDEBUG
   bool IsNewNode = NewCaller->CallerEdges.empty();
 #endif
-  for (auto &OldCallerEdge : OldCaller->CallerEdges) {
-    // The context ids moving to the new caller are the subset of this edge's
-    // context ids and the context ids on the callee edge being moved.
-    DenseSet<uint32_t> EdgeContextIdsToMove =
-        set_intersection(OldCallerEdge->getContextIds(), Edge->getContextIds());
-    set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
-    OldCallerEdge->AllocTypes =
-        computeAllocType(OldCallerEdge->getContextIds());
-    // In this function we expect that any pre-existing node already has edges
-    // from the same callers as the old node. That should be true in the
-    // current use case, where we will remove None-type edges after copying
-    // over all caller edges from the callee.
-    auto *ExistingCallerEdge =
-        NewCaller->findEdgeFromCaller(OldCallerEdge->Caller);
-    assert(IsNewNode || ExistingCallerEdge);
-    if (ExistingCallerEdge) {
-      ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
-                                                 EdgeContextIdsToMove.end());
-      ExistingCallerEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
-      continue;
+  // If we just moved a direct recursive edge, presumably its context ids
+  // should also flow out of OldCaller via some other non-recursive callee
+  // edge. We don't want to remove the recursive context ids from other caller
+  // edges yet, otherwise the context ids get into an inconsistent state on
+  // OldCaller. We will update these context ids on the non-recursive caller
+  // edge when and if they are updated on the non-recursive callee.
+  if (!Recursive) {
+    for (auto &OldCallerEdge : OldCaller->CallerEdges) {
+      auto OldCallerCaller = OldCallerEdge->Caller;
+      // The context ids moving to the new caller are the subset of this
+      // edge's context ids and the context ids on the callee edge being
+      // moved.
+      DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
+          OldCallerEdge->getContextIds(), Edge->getContextIds());
+      if (OldCaller == OldCallerCaller) {
+        OldCallerCaller = NewCaller;
+        // Don't actually move this one. The caller will move it directly via
+        // a call to this function with this as the Edge if it is appropriate
+        // to move to a different node that has a matching callee (itself).
+        continue;
+      }
+      set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
+      OldCallerEdge->AllocTypes =
+          computeAllocType(OldCallerEdge->getContextIds());
+      // In this function we expect that any pre-existing node already has
+      // edges from the same callers as the old node. That should be true in
+      // the current use case, where we will remove None-type edges after
+      // copying over all caller edges from the callee.
+      auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
+      // Since we would have skipped caller edges when moving a direct
+      // recursive edge, this may not hold true when recursive handling is
+      // enabled.
+      assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
+      if (ExistingCallerEdge) {
+        ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
                                                    EdgeContextIdsToMove.end());
+        ExistingCallerEdge->AllocTypes |=
+            computeAllocType(EdgeContextIdsToMove);
+        continue;
+      }
+      auto NewEdge = std::make_shared<ContextEdge>(
+          NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
+          EdgeContextIdsToMove);
+      NewCaller->CallerEdges.push_back(NewEdge);
+      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
     }
-    auto NewEdge = std::make_shared<ContextEdge>(
-        NewCaller, OldCallerEdge->Caller,
-        computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
-    NewCaller->CallerEdges.push_back(NewEdge);
-    NewEdge->Caller->CalleeEdges.push_back(NewEdge);
   }
   // Recompute the node alloc type now that its caller edges have been
   // updated (since we will compute from those edges).
@@ -3946,6 +4027,11 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
           // handling and makes it less error-prone.
           auto CloneCallerEdges = Clone->CallerEdges;
           for (auto &Edge : CloneCallerEdges) {
+            // Skip removed edges (due to direct recursive edges updated when
+            // updating callee edges when moving an edge, and subsequently
+            // removed by a call to removeNoneTypeCalleeEdges on the Clone).
+            if (Edge->isRemoved())
+              continue;
             // Ignore any caller that does not have a recorded callsite Call.
             if (!Edge->Caller->hasCall())
               continue;
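Throughout these hunks the context id bookkeeping leans on the four-argument overload of llvm::set_subtract from llvm/ADT/SetOperations.h. A standalone re-implementation of its semantics (stand-in names, plain std::set) may help in following the NewEdgeContextIds/NotFoundContextIds handling in connectNewNode above:

```cpp
#include <cstdint>
#include <set>

using IdSet = std::set<uint32_t>;

// Remove from S1 every element of S2 that is present in S1. Elements actually
// removed are copied into Removed; elements of S2 that were absent from S1
// are copied into Remaining.
void setSubtract(IdSet &S1, const IdSet &S2, IdSet &Removed, IdSet &Remaining) {
  for (uint32_t V : S2) {
    if (S1.erase(V))
      Removed.insert(V);
    else
      Remaining.insert(V);
  }
}
```

In connectNewNode, S1 is the edge's id set, S2 is RemainingContextIds, Removed becomes the new edge's ids, and Remaining holds the ids still to be matched on later edges.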
diff --git a/llvm/test/ThinLTO/X86/memprof-icp-recursive.ll b/llvm/test/ThinLTO/X86/memprof-icp-recursive.ll
new file mode 100644
index 0000000000000..f8dcd80d4e141
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-icp-recursive.ll
@@ -0,0 +1,260 @@
+;; This test case is the same as memprof-icp.ll but manually modified to
+;; add recursive cycles into the allocation contexts, and it has
+;; -memprof-allow-recursive-callsites enabled to test that handling.
+;; The call contexts are somewhat nonsensical, but designed to provoke
+;; several scenarios that were seen in large applications and caused
+;; failures until they were addressed. In particular: converting mutual
+;; recursion to direct recursion when stack nodes are matched onto inlined
+;; callsites; edge partitioning due to multiple calls with the same debug
+;; info (here because it is an indirect call) requiring updating/moving
+;; of recursive contexts; and the necessary updates of recursive contexts
+;; and edges during cloning.
+
+;; The checks are however not modified from the original memprof-icp.ll test:
+;; when basic recursive callsite handling is enabled (which currently doesn't
+;; clone through recursive cycles but is designed to at least allow cloning
+;; across them for the non-recursive call sequence of the context) we
+;; should successfully get the same cloning as without any recursion.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+; RUN: split-file %s %t
+
+;; For now explicitly turn on this handling, which is off by default.
+; RUN: opt -thinlto-bc %t/main.ll -enable-memprof-indirect-call-support=true >%t/main.o
+; RUN: opt -thinlto-bc %t/foo.ll -enable-memprof-indirect-call-support=true >%t/foo.o
+
+;; First perform in-process ThinLTO
+; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
+; RUN:  -enable-memprof-indirect-call-support=true \
+; RUN:  -memprof-allow-recursive-callsites \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t/foo.o,_Z3fooR2B0j,plx \
+; RUN:  -r=%t/foo.o,_ZN2B03barEj, \
+; RUN:  -r=%t/foo.o,_ZN1B3barEj, \
+; RUN:  -r=%t/main.o,_Z3fooR2B0j, \
+; RUN:  -r=%t/main.o,_Znwm, \
+; RUN:  -r=%t/main.o,_ZdlPvm, \
+; RUN:  -r=%t/main.o,_Z8externalPi, \
+; RUN:  -r=%t/main.o,main,plx \
+; RUN:  -r=%t/main.o,_ZN2B03barEj,plx \
+; RUN:  -r=%t/main.o,_ZN1B3barEj,plx \
+; RUN:  -r=%t/main.o,_ZTV1B,plx \
+; RUN:  -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \
+; RUN:  -r=%t/main.o,_ZTS1B,plx \
+; RUN:  -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \
+; RUN:  -r=%t/main.o,_ZTS2B0,plx \
+; RUN:  -r=%t/main.o,_ZTI2B0,plx \
+; RUN:  -r=%t/main.o,_ZTI1B,plx \
+; RUN:  -r=%t/main.o,_ZTV2B0,plx \
+; RUN:  -thinlto-threads=1 \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=. -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS \
+; RUN:  --check-prefix=REMARKS
+
+; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
+; REMARKS: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
+; REMARKS: created clone _ZN2B03barEj.memprof.1
+; REMARKS: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: created clone _ZN1B3barEj.memprof.1
+; REMARKS: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: created clone _Z3fooR2B0j.memprof.1
+;; In each version of foo we should have promoted the indirect call to two conditional
+;; direct calls, one to B::bar and one to B0::bar. The cloned version of foo should call
+;; the cloned versions of bar for both promotions.
+; REMARKS: Promote indirect call to _ZN1B3barEj with count 2 out of 4
+; REMARKS: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN1B3barEj
+; REMARKS: Promote indirect call to _ZN1B3barEj with count 2 out of 4
+; REMARKS: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN1B3barEj.memprof.1
+; REMARKS: Promote indirect call to _ZN2B03barEj with count 2 out of 2
+; REMARKS: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN2B03barEj
+; REMARKS: Promote indirect call to _ZN2B03barEj with count 2 out of 2
+; REMARKS: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN2B03barEj.memprof.1
+; REMARKS: created clone _ZN2B03barEj.memprof.1
+; REMARKS: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: created clone _ZN1B3barEj.memprof.1
+; REMARKS: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
+; REMARKS: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
+
+; STATS: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during whole program analysis
+; STATS: 8 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
+; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during whole program analysis
+; STATS: 8 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS: 5 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+
+; IR: define {{.*}} @_Z3fooR2B0j(
+; IR: %[[R1:[0-9]+]] = icmp eq ptr %0, @_ZN1B3barEj
+; IR: br i1 %[[R1]], label %if.true.direct_targ, label %if.false.orig_indirect
+; IR: if.true.direct_targ:
+; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD:[0-9]+]]
+; IR: if.false.orig_indirect:
+; IR: %[[R2:[0-9]+]] = icmp eq ptr %0, @_ZN2B03barEj
+; IR: br i1 %[[R2]], label %if.true.direct_targ1, label %if.false.orig_indirect2
+; IR: if.true.direct_targ1:
+; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD]]
+; IR: if.false.orig_indirect2:
+; IR: call {{.*}} %0
+
+; IR: define {{.*}} @_Z3fooR2B0j.memprof.1(
+;; We should still compare against the original versions of bar since that is
+;; what is in the vtable. However, we should have called the cloned versions
+;; that perform cold allocations, which were subsequently inlined.
+; IR: %[[R3:[0-9]+]] = icmp eq ptr %0, @_ZN1B3barEj
+; IR: br i1 %[[R3]], label %if.true.direct_targ, label %if.false.orig_indirect
+; IR: if.true.direct_targ:
+; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD:[0-9]+]]
+; IR: if.false.orig_indirect:
+; IR: %[[R4:[0-9]+]] = icmp eq ptr %0, @_ZN2B03barEj
+; IR: br i1 %[[R4]], label %if.true.direct_targ1, label %if.false.orig_indirect2
+; IR: if.true.direct_targ1:
+; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD]]
+; IR: if.false.orig_indirect2:
+; IR: call {{.*}} %0
+
+; IR: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold"
+; IR: attributes #[[COLD]] = {{.*}} "memprof"="cold"
+
+;--- foo.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @_ZN2B03barEj(ptr %this, i32 %s)
+declare i32 @_ZN1B3barEj(ptr %this, i32 %s)
+
+define i32 @_Z3fooR2B0j(ptr %b) {
+entry:
+  %0 = load ptr, ptr %b, align 8
+  %call = tail call i32 %0(ptr null, i32 0), !prof !0, !callsite !1
+  ret i32 0
+}
+
+!0 = !{!"VP", i32 0, i64 4, i64 4445083295448962937, i64 2, i64 -2718743882639408571, i64 2}
+!1 = !{i64 -2101080423462424381, i64 3456}
+
+;--- main.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@_ZTV1B = external constant { [3 x ptr] }
+@_ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr]
+@_ZTS1B = external constant [3 x i8]
+@_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
+@_ZTS2B0 = external constant [4 x i8]
+@_ZTI2B0 = external constant { ptr, ptr }
+@_ZTI1B = external constant { ptr, ptr, ptr }
+@_ZTV2B0 = external constant { [3 x ptr] }
+
+define i32 @main() !prof !29 {
+entry:
+  %call2 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !30
+  %call4 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !31
+  %call6 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !32
+  %call8 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !57
+  ret i32 0
+}
+
+declare i32 @_Z3fooR2B0j(ptr, i32)
+
+define i32 @_ZN2B03barEj(ptr %this, i32 %s) {
+entry:
+  %call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !33, !callsite !38
+  ;; Second allocation in this function, to ensure that indirect edges to the
+  ;; same callee are partitioned correctly.
+  %call2 = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !45, !callsite !50
+  store volatile i32 0, ptr %call, align 4
+  ret i32 0
+}
+
+declare ptr @_Znwm(i64)
+
+declare void @_Z8externalPi()
+
+declare void @_ZdlPvm()
+
+define i32 @_ZN1B3barEj(ptr %this, i32 %s) {
+entry:
+  %call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !39, !callsite !44
+  ;; Second allocation in this function, to ensure that indirect edges to the
+  ;; same callee are partitioned correctly.
+  %call2 = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !51, !callsite !56
+  store volatile i32 0, ptr %call, align 4
+  ret i32 0
+}
+
+; uselistorder directives
+uselistorder ptr @_Z3fooR2B0j, { 3, 2, 1, 0 }
+
+attributes #0 = { builtin allocsize(0) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 13}
+!4 = !{!"MaxCount", i64 4}
+!5 = !{!"MaxInternalCount", i64 0}
+!6 = !{!"MaxFunctionCount", i64 4}
+!7 = !{!"NumCounts", i64 5}
+!8 = !{!"NumFunctions", i64 5}
+!9 = !{!"IsPartialProfile", i64 0}
+!10 = !{!"PartialProfileRatio", double 0.000000e+00}
+!11 = !{!"DetailedSummary", !12}
+!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
+!13 = !{i32 10000, i64 0, i32 0}
+!14 = !{i32 100000, i64 4, i32 2}
+!15 = !{i32 200000, i64 4, i32 2}
+!16 = !{i32 300000, i64 4, i32 2}
+!17 = !{i32 400000, i64 4, i32 2}
+!18 = !{i32 500000, i64 4, i32 2}
+!19 = !{i32 600000, i64 4, i32 2}
+!20 = !{i32 700000, i64 2, i32 4}
+!21 = !{i32 800000, i64 2, i32 4}
+!22 = !{i32 900000, i64 2, i32 4}
+!23 = !{i32 950000, i64 2, i32 4}
+!24 = !{i32 990000, i64 2, i32 4}
+!25 = !{i32 999000, i64 2, i32 4}
+!26 = !{i32 999900, i64 2, i32 4}
+!27 = !{i32 999990, i64 2, i32 4}
+!28 = !{i32 999999, i64 2, i32 4}
+!29 = !{!"function_entry_count", i64 1}
+!30 = !{i64 -6490791336773930154}
+!31 = !{i64 5188446645037944434}
+!32 = !{i64 5583420417449503557}
+!57 = !{i64 132626519179914298}
+!33 = !{!34, !36}
+!34 = !{!35, !"notcold"}
+!35 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 3456, i64 -2101080423462424381, i64 3456, i64 5188446645037944434}
+!36 = !{!37, !"cold"}
+!37 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 3456, i64 -2101080423462424381, i64 3456, i64 5583420417449503557}
+!38 = !{i64 -852997907418798798}
+!39 = !{!40, !42}
+!40 = !{!41, !"notcold"}
+!41 = !{i64 4457553070050523782, i64 -2101080423462424381, i64 3456, i64 -2101080423462424381, i64 3456, i64 132626519179914298}
+!42 = !{!43, !"cold"}
+!43 = !{i64 4457553070050523782, i64 -2101080423462424381, i64 3456, i64 -2101080423462424381, i64 3456, i64 -6490791336773930154}
+!44 = !{i64 4457553070050523782}
+!45 = !{!46, !48}
+!46 = !{!47, !"notcold"}
+!47 = !{i64 456, i64 -2101080423462424381, i64 3456, i64 5188446645037944434}
+!48 = !{!49, !"cold"}
+!49 = !{i64 456, i64 -2101080423462424381, i64 3456, i64 5583420417449503557}
+!50 = !{i64 456}
+!51 = !{!52, !54}
+!52 = !{!53, !"notcold"}
+!53 = !{i64 789, i64 -2101080423462424381, i64 3456, i64 132626519179914298}
+!54 = !{!55, !"cold"}
+!55 = !{i64 789, i64 -2101080423462424381, i64 3456, i64 -6490791336773930154}
+!56 = !{i64 789}
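The MIB contexts in this test encode recursion by repeating a stack id pair in the frame sequence; note the duplicated -2101080423462424381/3456 frames in !35, !37, !41 and !43. A context can therefore be classified as recursive by scanning its stack ids for a duplicate, sketched here as a hypothetical helper (not code from the patch):

```cpp
#include <cstdint>
#include <unordered_set>
#include <vector>

// Returns true if any stack frame id repeats within a single allocation
// context, i.e. the call chain visits the same callsite twice (a cycle).
bool isRecursiveContext(const std::vector<uint64_t> &StackIds) {
  std::unordered_set<uint64_t> Seen;
  for (uint64_t Id : StackIds)
    if (!Seen.insert(Id).second)
      return true;
  return false;
}
```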
diff --git a/llvm/test/ThinLTO/X86/memprof-recursive.ll b/llvm/test/ThinLTO/X86/memprof-recursive.ll
index 1f4f75460b070..5be27c3e380fc 100644
--- a/llvm/test/ThinLTO/X86/memprof-recursive.ll
+++ b/llvm/test/ThinLTO/X86/memprof-recursive.ll
@@ -39,7 +39,7 @@
 ; RUN:   --implicit-check-not="created clone" \
 ; RUN:   --implicit-check-not="marked with memprof allocation attribute cold"
 
-;; Check the default behavior (disabled recursive callsites).
+;; Check the default behavior (enabled recursive callsites).
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
 ; RUN:  -supports-hot-cold-new \
 ; RUN:  -r=%t.o,_Z1Dv,plx \
@@ -49,10 +49,9 @@
 ; RUN:  -r=%t.o,_Znam, \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
 ; RUN:  -pass-remarks=memprof-context-disambiguation \
-; RUN:  -o %t.out 2>&1 | FileCheck %s --allow-empty \
+; RUN:  -o %t.out 2>&1 | FileCheck %s \
 ; RUN:   --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
-; RUN:   --implicit-check-not="created clone" \
-; RUN:   --implicit-check-not="marked with memprof allocation attribute cold"
+; RUN:   --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS
 
 ;; Skipping recursive contexts should prevent spurious call to cloned version of
 ;; B from the context starting at memprof_recursive.cc:19:13, which is actually
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
index fd87cb535c42e..1a8020540f3a8 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
@@ -57,15 +57,13 @@
 ; RUN:   --implicit-check-not="marked with memprof allocation attribute cold" \
 ; RUN:   --check-prefix=ALL
 
-;; Check the default behavior (disabled recursive callsites).
+;; Check the default behavior (enabled recursive callsites).
 ; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
 ; RUN:  -pass-remarks=memprof-context-disambiguation \
 ; RUN:  %s -S 2>&1 | FileCheck %s \
 ; RUN:   --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
-; RUN:   --implicit-check-not="created clone" \
-; RUN:   --implicit-check-not="marked with memprof allocation attribute cold" \
-; RUN:   --check-prefix=ALL
+; RUN:   --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS
 
 ;; Skipping recursive contexts should prevent spurious call to cloned version of
 ;; B from the context starting at memprof_recursive.cc:19:13, which is actually