llvm · teresajohnson · Sep 25, 2025 · Sep 19, 2025 · Sep 25, 2025 · Sep 25, 2025
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -4553,6 +4553,34 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
     DenseMap<CallInfo, CallInfo> CallMap;
   };
 
+  // Map to keep track of information needed to update calls in function clones
+  // when their corresponding callsite node was not itself cloned for that
+  // function clone. Because of call context pruning (i.e. we only keep as much
+  // caller information as needed to distinguish hot vs cold), we may not have
+  // caller edges coming to each callsite node from all possible function
+  // callers. A function clone may get created for other callsites in the
+  // function for which there are caller edges that were not pruned. Any other
+  // callsites in that function clone, which were not themselved cloned for
+  // that function clone, should get updated the same way as the corresponding
+  // callsite in the original function (which may call a clone of its callee).
+  //
+  // We build this map after completing function cloning for each function, so
+  // that we can record the information from its call maps before they are
+  // destructed. The map will be used as we update calls to update any still
+  // unassigned call clones. Note that we may create new node clones as we clone
+  // other functions, so later on we check which node clones were still not
+  // created. To this end, the inner map is a map from function clone number to
+  // the list of calls cloned for that function (can be more than one due to the
+  // Node's MatchingCalls array).
+  //
+  // The alternative is creating new callsite clone nodes below as we clone the
+  // function, but that is tricker to get right and likely more overhead.
+  //
+  // Inner map is a std::map so sorted by key (clone number), in order to get
+  // ordered remarks in the full LTO case.
+  DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
+      UnassignedCallClones;
+
   // Walk all functions for which we saw calls with memprof metadata, and handle
   // cloning for each of its calls.
   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
@@ -4986,6 +5014,63 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
         }
       }
     }
+
+    if (FuncCloneInfos.size() < 2)
+      continue;
+
+    // In this case there is more than just the original function copy.
+    // Record call clones of any callsite nodes in the function that did not
+    // themselves get cloned for all of the function clones.
+    for (auto &Call : CallsWithMetadata) {
+      ContextNode *Node = getNodeForInst(Call);
+      if (!Node || !Node->hasCall() || Node->emptyContextIds())
+        continue;
+      // If Node has enough clones already to cover all function clones, we can
+      // skip it. Need to add one for the original copy.
+      // Use >= in case there were clones that were skipped due to having empty
+      // context ids
+      if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
+        continue;
+      // First collect all function clones we cloned this callsite node for.
+      // They may not be sequential due to empty clones e.g.
+      DenseSet<unsigned> NodeCallClones;
+      for (auto *C : Node->Clones)
+        NodeCallClones.insert(C->Call.cloneNo());
+      unsigned I = 0;
+      // Now check all the function clones.
+      for (auto &FC : FuncCloneInfos) {
+        // Function clones should be sequential.
+        assert(FC.FuncClone.cloneNo() == I);
+        // Skip the first clone which got the original call.
+        // Also skip any other clones created for this Node.
+        if (++I == 1 || NodeCallClones.contains(I)) {
+          continue;
+        }
+        // Record the call clones created for this callsite in this function
+        // clone.
+        auto &CallVector = UnassignedCallClones[Node][I];
+        DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
+        CallInfo CallClone(Call);
+        if (auto It = CallMap.find(Call); It != CallMap.end())
+          CallClone = It->second;
+        else
+          // All but the original clone (skipped earlier) should have an entry
+          // for all calls.
+          assert(false && "Expected to find call in CallMap");
+        CallVector.push_back(CallClone);
+        // Need to do the same for all matching calls.
+        for (auto &MatchingCall : Node->MatchingCalls) {
+          CallInfo CallClone(MatchingCall);
+          if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
+            CallClone = It->second;
+          else
+            // All but the original clone (skipped earlier) should have an entry
+            // for all calls.
+            assert(false && "Expected to find call in CallMap");
+          CallVector.push_back(CallClone);
+        }
+      }
+    }
   }
 
   uint8_t BothTypes =
@@ -5047,6 +5132,25 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
     // Update all the matching calls as well.
     for (auto &Call : Node->MatchingCalls)
       updateCall(Call, CalleeFunc);
+
+    // Now update all calls recorded earlier that are still in function clones
+    // which don't have a clone of this callsite node.
+    if (!UnassignedCallClones.contains(Node))
+      return;
+    DenseSet<unsigned> NodeCallClones;
+    for (auto *C : Node->Clones)
+      NodeCallClones.insert(C->Call.cloneNo());
+    auto &ClonedCalls = UnassignedCallClones[Node];
+    for (auto &[CloneNo, CallVector] : ClonedCalls) {
+      // Should start at 1 as we never create an entry for original node.
+      assert(CloneNo > 0);
+      // If we subsequently created a clone, skip this one.
+      if (NodeCallClones.contains(CloneNo))
+        continue;
+      // Use the original Node's CalleeFunc.
+      for (auto &Call : CallVector)
+        updateCall(Call, CalleeFunc);
+    }
   };
 
   // Performs DFS traversal starting from allocation nodes to update calls to

diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning2.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning2.ll
@@ -0,0 +1,142 @@
+;; Similar to funcassigncloning.ll but hand modified to add another allocation
+;; whose pruned cold context only includes an immediate caller node that itself
+;; doesn't need cloning, but calls a cloned allocating function, and is in a
+;; function that gets cloned multiple times for a different callsite. This test
+;; makes sure the non-cloned callsite is correctly updated in all function
+;; clones. This case was missed because due to context pruning we don't have any
+;; caller edges for the first callsite, so the handling that kicks in to
+;; "reclone" other callsites in cloned functions was being missed.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKS
+
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -thinlto-distributed-indexes \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t2.out
+
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=REMARKS
+
+
+source_filename = "funcassigncloning.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;; Eventually this function will be cloned several times (for the calls to new
+;; for the various callers). However, function blah() includes an allocation
+;; whose cold context was trimmed above here. We therefore should assume that
+;; every caller of this function should call the same version of blah (which
+;; will be the cloned ".memprof.1" version.
+; Function Attrs: noinline optnone
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 {
+entry:
+  call void @blah(), !callsite !19
+  %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7
+  %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15
+  ret void
+}
+
+; REMARKS: call in clone _Z1EPPcS0_ assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.1 assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 assigned to call function clone blah.memprof.1
+
+; IR: define {{.*}} @_Z1EPPcS0_
+; IR:   call {{.*}} @blah.memprof.1()
+; IR: define {{.*}} @_Z1EPPcS0_.memprof.2
+; IR:   call {{.*}} @blah.memprof.1()
+; IR: define {{.*}} @_Z1EPPcS0_.memprof.3
+; IR:   call {{.*}} @blah.memprof.1()
+
+declare ptr @_Znam(i64)
+
+define internal void @_Z1BPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16
+  ret void
+}
+
+define internal void @_Z1CPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !17
+  ret void
+}
+
+define internal void @_Z1DPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18
+  ret void
+}
+
+; Function Attrs: noinline optnone
+define i32 @main() #0 {
+entry:
+  call void @_Z1BPPcS0_()
+  call void @_Z1CPPcS0_()
+  call void @_Z1DPPcS0_()
+  ret i32 0
+}
+
+define internal void @blah() #0 {
+entry:
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !22, !callsite !21
+  ret void
+}
+
+define internal void @foo() #0 {
+entry:
+  call void @blah(), !callsite !20
+  ret void
+}
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0, 2 }
+
+attributes #0 = { noinline optnone }
+
+!0 = !{!1, !3, !5}
+!1 = !{!2, !"cold"}
+!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
+!5 = !{!6, !"notcold"}
+!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
+!7 = !{i64 -3461278137325233666}
+!8 = !{!9, !11, !13}
+!9 = !{!10, !"notcold"}
+!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
+!11 = !{!12, !"cold"}
+!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
+!13 = !{!14, !"notcold"}
+!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
+!15 = !{i64 -1415475215210681400}
+!16 = !{i64 -2441057035866683071}
+!17 = !{i64 -3483158674395044949}
+!18 = !{i64 -7799663586031895603}
+!19 = !{i64 123}
+!20 = !{i64 234}
+!21 = !{i64 345}
+!22 = !{!23, !25}
+!23 = !{!24, !"cold"}
+!24 = !{i64 345, i64 123}
+!25 = !{!26, !"notcold"}
+!26 = !{i64 345, i64 234}
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning2.ll
@@ -0,0 +1,122 @@
+;; Similar to funcassigncloning.ll but hand modified to add another allocation
+;; whose pruned cold context only includes an immediate caller node that itself
+;; doesn't need cloning, but calls a cloned allocating function, and is in a
+;; function that gets cloned multiple times for a different callsite. This test
+;; makes sure the non-cloned callsite is correctly updated in all function
+;; clones. This case was missed because due to context pruning we don't have any
+;; caller edges for the first callsite, so the handling that kicks in to
+;; "reclone" other callsites in cloned functions was being missed.
+
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=IR --check-prefix=REMARKS
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;; Eventually this function will be cloned several times (for the calls to new
+;; for the various callers). However, function blah() includes an allocation
+;; whose cold context was trimmed above here. We therefore should assume that
+;; every caller of this function should call the same version of blah (which
+;; will be the cloned ".memprof.1" version.
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 {
+entry:
+  call void @blah(), !callsite !19
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !0, !callsite !7
+  %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !8, !callsite !15
+  ret void
+}
+
+; REMARKS: created clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_ assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.1 assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 assigned to call function clone blah.memprof.1
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 assigned to call function clone blah.memprof.1
+
+; IR: define {{.*}} @_Z1EPPcS0_
+; IR:   call {{.*}} @blah.memprof.1()
+; IR: define {{.*}} @_Z1EPPcS0_.memprof.1
+; IR:   call {{.*}} @blah.memprof.1()
+; IR: define {{.*}} @_Z1EPPcS0_.memprof.2
+; IR:   call {{.*}} @blah.memprof.1()
+; IR: define {{.*}} @_Z1EPPcS0_.memprof.3
+; IR:   call {{.*}} @blah.memprof.1()
+
+declare ptr @_Znam(i64) #1
+
+define internal void @_Z1BPPcS0_(ptr %0, ptr %1) {
+entry:
+  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !16
+  ret void
+}
+
+; Function Attrs: noinline
+define internal void @_Z1CPPcS0_(ptr %0, ptr %1) #2 {
+entry:
+  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !17
+  ret void
+}
+
+define internal void @_Z1DPPcS0_(ptr %0, ptr %1) #3 {
+entry:
+  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !18
+  ret void
+}
+
+define internal void @blah() #0 {
+entry:
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !22, !callsite !21
+  ret void
+}
+
+define internal void @foo() #0 {
+entry:
+  call void @blah(), !callsite !20
+  ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4
+
+declare i32 @sleep() #5
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0, 2 }
+
+attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" }
+attributes #1 = { "no-trapping-math"="true" }
+attributes #2 = { noinline }
+attributes #3 = { "frame-pointer"="all" }
+attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #5 = { "disable-tail-calls"="true" }
+attributes #6 = { builtin }
+
+!0 = !{!1, !3, !5}
+!1 = !{!2, !"cold"}
+!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
+!5 = !{!6, !"notcold"}
+!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
+!7 = !{i64 -3461278137325233666}
+!8 = !{!9, !11, !13}
+!9 = !{!10, !"notcold"}
+!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
+!11 = !{!12, !"cold"}
+!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
+!13 = !{!14, !"notcold"}
+!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
+!15 = !{i64 -1415475215210681400}
+!16 = !{i64 -2441057035866683071}
+!17 = !{i64 -3483158674395044949}
+!18 = !{i64 -7799663586031895603}
+!19 = !{i64 123}
+!20 = !{i64 234}
+!21 = !{i64 345}
+!22 = !{!23, !25}
+!23 = !{!24, !"cold"}
+!24 = !{i64 345, i64 123}
+!25 = !{!26, !"notcold"}
+!26 = !{i64 345, i64 234}