From f6ec3fa665e39ecf6fe554509b5caf0727654d41 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Fri, 25 Jul 2025 19:38:30 -0700 Subject: [PATCH 1/3] [MemProf] Ensure all callsite clones are assigned a function clone Fix a bug in function assignment where we were not assigning all callsite clones to a function clone. This led to incorrect call updates because multiple callsite clones could look like they were assigned to the same function clone. Add in a stat and debug message to help identify and debug cases where this is still happening. --- .../IPO/MemProfContextDisambiguation.cpp | 89 +++++++++-- .../ThinLTO/X86/memprof_func_assign_fix.ll | 145 ++++++++++++++++++ .../func_assign_fix.ll | 130 ++++++++++++++++ 3 files changed, 352 insertions(+), 12 deletions(-) create mode 100644 llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/func_assign_fix.ll diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index b803c97a7bd99..e94bdd5d5e555 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -97,6 +97,8 @@ STATISTIC(MissingAllocForContextId, "Number of missing alloc nodes for context ids"); STATISTIC(SkippedCallsCloning, "Number of calls skipped during cloning due to unexpected operand"); +STATISTIC(MismatchedCloneAssignments, + "Number of callsites assigned to call multiple non-matching clones"); static cl::opt DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -2060,6 +2062,18 @@ static bool isMemProfClone(const Function &F) { return F.getName().contains(MemProfCloneSuffix); } +// Return the clone number of the given function by extracting it from the +// memprof suffix. Assumes the caller has already confirmed it is a memprof +// clone. +static unsigned getMemProfCloneNum(const Function &F) { + assert(isMemProfClone(F)); + auto Pos = F.getName().find_last_of('.'); + assert(Pos > 0); + unsigned CloneNo; + F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo); + return CloneNo; +} + std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const { @@ -3979,7 +3993,22 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { - if (CalleeFunc.cloneNo() > 0) + auto *CurF = cast(CallerCall.call())->getCalledFunction(); + auto NewCalleeCloneNo = CalleeFunc.cloneNo(); + if (isMemProfClone(*CurF)) { + // If we already assigned this callsite to call a specific non-default + // clone (i.e. not the original function which is clone 0), ensure that we + // aren't trying to now update it to call a different clone, which is + // indicative of a bug in the graph or function assignment. + auto CurCalleeCloneNo = getMemProfCloneNum(*CurF); + if (CurCalleeCloneNo != NewCalleeCloneNo) { + LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " + << CurCalleeCloneNo << " now " << NewCalleeCloneNo + << "\n"); + MismatchedCloneAssignments++; + } + } + if (NewCalleeCloneNo > 0) cast(CallerCall.call())->setCalledFunction(CalleeFunc.func()); OREGetter(CallerCall.call()->getFunction()) .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) @@ -3995,7 +4024,19 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, assert(CI && "Caller cannot be an allocation which should not have profiled calls"); assert(CI->Clones.size() > CallerCall.cloneNo()); - CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); + auto NewCalleeCloneNo = CalleeFunc.cloneNo(); + auto CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()]; + // If we already assigned this callsite to call a specific non-default + // clone (i.e. not the original function which is clone 0), ensure that we + // aren't trying to now update it to call a different clone, which is + // indicative of a bug in the graph or function assignment. + if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) { + LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " + << CurCalleeCloneNo << " now " << NewCalleeCloneNo + << "\n"); + MismatchedCloneAssignments++; + } + CI->Clones[CallerCall.cloneNo()] = NewCalleeCloneNo; } // Update the debug information attached to NewFunc to use the clone Name. Note @@ -4703,6 +4744,18 @@ bool CallsiteContextGraph::assignFunctions() { // where the callers were assigned to different clones of a function. } + auto FindFirstAvailFuncClone = [&]() { + // Find first function in FuncClonesToCallMap without an assigned + // clone of this callsite Node. We should always have one + // available at this point due to the earlier cloning when the + // FuncClonesToCallMap size was smaller than the clone number. + for (auto &CF : FuncClonesToCallMap) { + if (!FuncCloneToCurNodeCloneMap.count(CF.first)) + return CF.first; + } + assert(false); + }; + // See if we can use existing function clone. Walk through // all caller edges to see if any have already been assigned to // a clone of this callsite's function. If we can use it, do so. If not, @@ -4819,16 +4872,7 @@ bool CallsiteContextGraph::assignFunctions() { // clone of OrigFunc for another caller during this iteration over // its caller edges. if (!FuncCloneAssignedToCurCallsiteClone) { - // Find first function in FuncClonesToCallMap without an assigned - // clone of this callsite Node. We should always have one - // available at this point due to the earlier cloning when the - // FuncClonesToCallMap size was smaller than the clone number. - for (auto &CF : FuncClonesToCallMap) { - if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { - FuncCloneAssignedToCurCallsiteClone = CF.first; - break; - } - } + FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); assert(FuncCloneAssignedToCurCallsiteClone); // Assign Clone to FuncCloneAssignedToCurCallsiteClone AssignCallsiteCloneToFuncClone( @@ -4842,6 +4886,27 @@ bool CallsiteContextGraph::assignFunctions() { FuncCloneAssignedToCurCallsiteClone); } } + // If we didn't assign a function clone to this callsite clone yet, e.g. + // none of its callers has a non-null call, do the assignment here. + // We want to ensure that every callsite clone is assigned to some + // function clone, so that the call updates below work as expected. + // In particular if this is the original callsite, we want to ensure it + // is assigned to the original function, otherwise the original function + // will appear available for assignment to other callsite clones, + // leading to unintended effects. For one, the unknown and not updated + // callers will call into cloned paths leading to the wrong hints, + // because they still call the original function (clone 0). Also, + // because all callsites start out as being clone 0 by default, we can't + // easily distinguish between callsites explicitly assigned to clone 0 + // vs those never assigned, which can lead to multiple updates of the + // calls when invoking updateCall below, with mismatched clone values. + if (!FuncCloneAssignedToCurCallsiteClone) { + FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); + assert(FuncCloneAssignedToCurCallsiteClone); + AssignCallsiteCloneToFuncClone( + FuncCloneAssignedToCurCallsiteClone, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + } } if (VerifyCCG) { checkNode(Node); diff --git a/llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll b/llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll new file mode 100644 index 0000000000000..bd358509bcb3a --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll @@ -0,0 +1,145 @@ +;; Make sure we assign the original callsite to a function clone (which will be +;; the original function clone), even when we cannot update its caller (due to +;; missing metadata e.g. from mismatched profiles). Otherwise we will try to use +;; the original function for a different clone, leading to confusion later when +;; rewriting the calls. + +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,A,plx \ +; RUN: -r=%t.o,B,plx \ +; RUN: -r=%t.o,C,plx \ +; RUN: -r=%t.o,D,plx \ +; RUN: -r=%t.o,E,plx \ +; RUN: -r=%t.o,F,plx \ +; RUN: -r=%t.o,G,plx \ +; RUN: -r=%t.o,A1,plx \ +; RUN: -r=%t.o,B1,plx \ +; RUN: -r=%t.o,_Znwm, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -debug-only=memprof-context-disambiguation \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s \ +; RUN: --implicit-check-not="Mismatch in call clone assignment" \ +; RUN: --implicit-check-not="Number of callsites assigned to call multiple non-matching clones" + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + +; ModuleID = '' +source_filename = "reduced.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +; IR-LABEL: define dso_local void @A() +define void @A() #0 { + ; IR: call void @C() + call void @C() + ret void +} + +; IR-LABEL: define dso_local void @B() +define void @B() #0 { + ; IR: call void @C.memprof.1() + call void @C(), !callsite !1 + ret void +} + +; IR-LABEL: define dso_local void @C() +define void @C() #0 { + ; IR: call void @F() + call void @F(), !callsite !16 + ; IR: call void @D() + call void @D(), !callsite !2 + ret void +} + +; IR-LABEL: define dso_local void @D() +define void @D() #0 { + ; IR: call void @E() + call void @E(), !callsite !3 + ; IR: call void @G() + call void @G(), !callsite !17 + ret void +} + +; IR-LABEL: define dso_local void @E() +define void @E() #0 { + ; IR: call ptr @_Znwm(i64 0) #[[NOTCOLD:[0-9]+]] + %1 = call ptr @_Znwm(i64 0), !memprof !4, !callsite !9 + ret void +} + +; IR-LABEL: define dso_local void @F() +define void @F() #0 { + ; IR: call void @G() + call void @G(), !callsite !17 + ret void +} + +; IR-LABEL: define dso_local void @G() +define void @G() #0 { + ; IR: call ptr @_Znwm(i64 0) #[[NOTCOLD]] + %2 = call ptr @_Znwm(i64 0), !memprof !10, !callsite !15 + ret void +} + +; IR-LABEL: define dso_local void @A1() +define void @A1() #0 { + ; IR: call void @C() + call void @C(), !callsite !18 + ret void +} + +; IR-LABEL: define dso_local void @B1() +define void @B1() #0 { + ; IR: call void @C.memprof.1() + call void @C(), !callsite !19 + ret void +} + +; IR-LABEL: define dso_local void @C.memprof.1() + ; IR: call void @F.memprof.1() + ; IR: call void @D.memprof.1() + +; IR-LABEL: define dso_local void @D.memprof.1() + ; IR: call void @E.memprof.1() + ; IR: call void @G() + +; IR-LABEL: define dso_local void @E.memprof.1() + ; IR: call ptr @_Znwm(i64 0) #[[COLD:[0-9]+]] + +; IR-LABEL: define dso_local void @F.memprof.1() + ; IR: call void @G.memprof.1() + +; IR-LABEL: define dso_local void @G.memprof.1() + ; IR: call ptr @_Znwm(i64 0) #[[COLD]] + +declare ptr @_Znwm(i64) + +attributes #0 = { noinline optnone } +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + +!0 = !{i64 123} +!1 = !{i64 234} +!2 = !{i64 345} +!3 = !{i64 456} +!4 = !{!5, !7} +!5 = !{!6, !"notcold"} +!6 = !{i64 567, i64 456, i64 345, i64 123} +!7 = !{!8, !"cold"} +!8 = !{i64 567, i64 456, i64 345, i64 234} +!9 = !{i64 567} +!10 = !{!11, !13} +!11 = !{!12, !"notcold"} +!12 = !{i64 678, i64 891, i64 789, i64 912} +!13 = !{!14, !"cold"} +!14 = !{i64 678, i64 891, i64 789, i64 812} +!15 = !{i64 678} +!16 = !{i64 789} +!17 = !{i64 891} +!18 = !{i64 912} +!19 = !{i64 812} diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/func_assign_fix.ll b/llvm/test/Transforms/MemProfContextDisambiguation/func_assign_fix.ll new file mode 100644 index 0000000000000..29c58c9c87ed9 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/func_assign_fix.ll @@ -0,0 +1,130 @@ +;; Make sure we assign the original callsite to a function clone (which will be +;; the original function clone), even when we cannot update its caller (due to +;; missing metadata e.g. from mismatched profiles). Otherwise we will try to use +;; the original function for a different clone, leading to confusion later when +;; rewriting the calls. + +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats -debug \ +; RUN: -pass-remarks=memprof-context-disambiguation %s -S 2>&1 | \ +; RUN: FileCheck %s --implicit-check-not="Mismatch in call clone assignment" \ +; RUN: --implicit-check-not="Number of callsites assigned to call multiple non-matching clones" + + +; ModuleID = '' +source_filename = "reduced.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +; CHECK-LABEL: define void @A() +define void @A() { + ; CHECK: call void @C() + call void @C() + ret void +} + +; CHECK-LABEL: define void @B() +define void @B() { + ; CHECK: call void @C.memprof.1() + call void @C(), !callsite !1 + ret void +} + +; CHECK-LABEL: define void @C() +define void @C() { + ; CHECK: call void @F() + call void @F(), !callsite !16 + ; CHECK: call void @D() + call void @D(), !callsite !2 + ret void +} + +; CHECK-LABEL: define void @D() +define void @D() { + ; CHECK: call void @E() + call void @E(), !callsite !3 + ; CHECK: call void @G() + call void @G(), !callsite !17 + ret void +} + +; CHECK-LABEL: define void @E() +define void @E() { + ; CHECK: call ptr @_Znwm(i64 0) #[[NOTCOLD:[0-9]+]] + %1 = call ptr @_Znwm(i64 0), !memprof !4, !callsite !9 + ret void +} + +; CHECK-LABEL: define void @F() +define void @F() { + ; CHECK: call void @G() + call void @G(), !callsite !17 + ret void +} + +; CHECK-LABEL: define void @G() +define void @G() { + ; CHECK: call ptr @_Znwm(i64 0) #[[NOTCOLD]] + %2 = call ptr @_Znwm(i64 0), !memprof !10, !callsite !15 + ret void +} + +; CHECK-LABEL: define void @A1() +define void @A1() { + ; CHECK: call void @C() + call void @C(), !callsite !18 + ret void +} + +; CHECK-LABEL: define void @B1() +define void @B1() { + ; CHECK: call void @C.memprof.1() + call void @C(), !callsite !19 + ret void +} + +; CHECK-LABEL: define void @C.memprof.1() + ; CHECK: call void @F.memprof.1() + ; CHECK: call void @D.memprof.1() + +; CHECK-LABEL: define void @D.memprof.1() + ; CHECK: call void @E.memprof.1() + ; CHECK: call void @G() + +; CHECK-LABEL: define void @E.memprof.1() + ; CHECK: call ptr @_Znwm(i64 0) #[[COLD:[0-9]+]] + +; CHECK-LABEL: define void @F.memprof.1() + ; CHECK: call void @G.memprof.1() + +; CHECK-LABEL: define void @G.memprof.1() + ; CHECK: call ptr @_Znwm(i64 0) #[[COLD]] + +declare ptr @_Znwm(i64) + +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + +!0 = !{i64 123} +!1 = !{i64 234} +!2 = !{i64 345} +!3 = !{i64 456} +!4 = !{!5, !7} +!5 = !{!6, !"notcold"} +!6 = !{i64 567, i64 456, i64 345, i64 123} +!7 = !{!8, !"cold"} +!8 = !{i64 567, i64 456, i64 345, i64 234} +!9 = !{i64 567} +!10 = !{!11, !13} +!11 = !{!12, !"notcold"} +!12 = !{i64 678, i64 891, i64 789, i64 912} +!13 = !{!14, !"cold"} +!14 = !{i64 678, i64 891, i64 789, i64 812} +!15 = !{i64 678} +!16 = !{i64 789} +!17 = !{i64 891} +!18 = !{i64 912} +!19 = !{i64 812} From ca562bac080ba35dfbb87688b0b7ecc78c875f1b Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Sun, 27 Jul 2025 10:21:46 -0700 Subject: [PATCH 2/3] Address comments --- .../IPO/MemProfContextDisambiguation.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index e94bdd5d5e555..06aee5a1c2a75 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -2070,7 +2070,9 @@ static unsigned getMemProfCloneNum(const Function &F) { auto Pos = F.getName().find_last_of('.'); assert(Pos > 0); unsigned CloneNo; - F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo); + auto Error = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo); + assert(!Error); + (void)Error; return CloneNo; } @@ -4025,7 +4027,7 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, "Caller cannot be an allocation which should not have profiled calls"); assert(CI->Clones.size() > CallerCall.cloneNo()); auto NewCalleeCloneNo = CalleeFunc.cloneNo(); - auto CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()]; + auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()]; // If we already assigned this callsite to call a specific non-default // clone (i.e. not the original function which is clone 0), ensure that we // aren't trying to now update it to call a different clone, which is @@ -4036,7 +4038,7 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, << "\n"); MismatchedCloneAssignments++; } - CI->Clones[CallerCall.cloneNo()] = NewCalleeCloneNo; + CurCalleeCloneNo = NewCalleeCloneNo; } // Update the debug information attached to NewFunc to use the clone Name. Note @@ -4753,7 +4755,8 @@ bool CallsiteContextGraph::assignFunctions() { if (!FuncCloneToCurNodeCloneMap.count(CF.first)) return CF.first; } - assert(false); + assert(false && + "Expected an available func clone for this callsite clone"); }; // See if we can use existing function clone. Walk through @@ -4900,12 +4903,16 @@ bool CallsiteContextGraph::assignFunctions() { // easily distinguish between callsites explicitly assigned to clone 0 // vs those never assigned, which can lead to multiple updates of the // calls when invoking updateCall below, with mismatched clone values. + // TODO: Add a flag to the callsite nodes or some other mechanism to + // better distinguish and identify callsite clones that are not getting + // assigned to function clones as expected. if (!FuncCloneAssignedToCurCallsiteClone) { FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); - assert(FuncCloneAssignedToCurCallsiteClone); + assert(FuncCloneAssignedToCurCallsiteClone && + "No available func clone for this callsite clone"); AssignCallsiteCloneToFuncClone( FuncCloneAssignedToCurCallsiteClone, Call, Clone, - AllocationCallToContextNodeMap.count(Call)); + /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call)); } } if (VerifyCCG) { From 7b47a141d659a3652ed5f2a705cc1639a200d379 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Sun, 27 Jul 2025 10:48:41 -0700 Subject: [PATCH 3/3] Specify type and don't use Error for getAsInteger return val. --- llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 06aee5a1c2a75..3fe525fcab498 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -2070,9 +2070,9 @@ static unsigned getMemProfCloneNum(const Function &F) { auto Pos = F.getName().find_last_of('.'); assert(Pos > 0); unsigned CloneNo; - auto Error = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo); - assert(!Error); - (void)Error; + bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo); + assert(!Err); + (void)Err; return CloneNo; }