Skip to content

Commit 302be34

Browse files
[MemProf] Make sure call clones without callsite node clones get updated (#159861)
Because we may prune differing amounts of call context for different allocation contexts during matching (we only keep enough call context to distinguish cold from noncold paths), we can end up with different numbers of callsite node clones for different callsites in the same function. Any callsites that don't have node clones for all function clones should have their copies in those other function clones updated the same way as the version in the original function, which might be calling a clone of the callsite.
1 parent 93cac97 commit 302be34

File tree

4 files changed

+382
-3
lines changed

4 files changed

+382
-3
lines changed

llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4526,6 +4526,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
45264526
// If Clone not already assigned to a function clone:
45274527
// Assign to first function clone without assignment
45284528
// Assign caller to selected function clone
4529+
// For each call with graph Node having clones:
4530+
// If number func clones > number call's callsite Node clones:
4531+
// Record func CallInfo clones without Node clone in UnassignedCallClones
4532+
// For callsite Nodes in DFS order from allocations:
4533+
// If IsAllocation:
4534+
// Update allocation with alloc type
4535+
// Else:
4536+
// For Call, all MatchingCalls, and associated UnassignedCallClones:
4537+
// Update call to call recorded callee clone
4538+
//
45294539
template <typename DerivedCCG, typename FuncTy, typename CallTy>
45304540
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
45314541
bool Changed = false;
@@ -4553,6 +4563,34 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
45534563
DenseMap<CallInfo, CallInfo> CallMap;
45544564
};
45554565

4566+
// Map to keep track of information needed to update calls in function clones
4567+
// when their corresponding callsite node was not itself cloned for that
4568+
// function clone. Because of call context pruning (i.e. we only keep as much
4569+
// caller information as needed to distinguish hot vs cold), we may not have
4570+
// caller edges coming to each callsite node from all possible function
4571+
// callers. A function clone may get created for other callsites in the
4572+
// function for which there are caller edges that were not pruned. Any other
4573+
// callsites in that function clone, which were not themselves cloned for
4574+
// that function clone, should get updated the same way as the corresponding
4575+
// callsite in the original function (which may call a clone of its callee).
4576+
//
4577+
// We build this map after completing function cloning for each function, so
4578+
// that we can record the information from its call maps before they are
4579+
// destructed. The map will be used as we update calls to update any still
4580+
// unassigned call clones. Note that we may create new node clones as we clone
4581+
// other functions, so later on we check which node clones were still not
4582+
// created. To this end, the inner map is a map from function clone number to
4583+
// the list of calls cloned for that function (can be more than one due to the
4584+
// Node's MatchingCalls array).
4585+
//
4586+
// The alternative is creating new callsite clone nodes below as we clone the
4587+
// function, but that is trickier to get right and likely more overhead.
4588+
//
4589+
// Inner map is a std::map so sorted by key (clone number), in order to get
4590+
// ordered remarks in the full LTO case.
4591+
DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>>
4592+
UnassignedCallClones;
4593+
45564594
// Walk all functions for which we saw calls with memprof metadata, and handle
45574595
// cloning for each of its calls.
45584596
for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
@@ -4996,6 +5034,63 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
49965034
}
49975035
}
49985036
}
5037+
5038+
if (FuncCloneInfos.size() < 2)
5039+
continue;
5040+
5041+
// In this case there is more than just the original function copy.
5042+
// Record call clones of any callsite nodes in the function that did not
5043+
// themselves get cloned for all of the function clones.
5044+
for (auto &Call : CallsWithMetadata) {
5045+
ContextNode *Node = getNodeForInst(Call);
5046+
if (!Node || !Node->hasCall() || Node->emptyContextIds())
5047+
continue;
5048+
// If Node has enough clones already to cover all function clones, we can
5049+
// skip it. Need to add one for the original copy.
5050+
// Use >= in case there were clones that were skipped due to having empty
5051+
// context ids.
5052+
if (Node->Clones.size() + 1 >= FuncCloneInfos.size())
5053+
continue;
5054+
// First collect all function clones we cloned this callsite node for.
5055+
// They may not be sequential due to empty clones e.g.
5056+
DenseSet<unsigned> NodeCallClones;
5057+
for (auto *C : Node->Clones)
5058+
NodeCallClones.insert(C->Call.cloneNo());
5059+
unsigned I = 0;
5060+
// Now check all the function clones.
5061+
for (auto &FC : FuncCloneInfos) {
5062+
// Function clones should be sequential.
5063+
assert(FC.FuncClone.cloneNo() == I);
5064+
// Skip the first clone which got the original call.
5065+
// Also skip any other clones created for this Node.
5066+
if (++I == 1 || NodeCallClones.contains(I)) {
5067+
continue;
5068+
}
5069+
// Record the call clones created for this callsite in this function
5070+
// clone.
5071+
auto &CallVector = UnassignedCallClones[Node][I];
5072+
DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap;
5073+
if (auto It = CallMap.find(Call); It != CallMap.end()) {
5074+
CallInfo CallClone = It->second;
5075+
CallVector.push_back(CallClone);
5076+
} else {
5077+
// All but the original clone (skipped earlier) should have an entry
5078+
// for all calls.
5079+
assert(false && "Expected to find call in CallMap");
5080+
}
5081+
// Need to do the same for all matching calls.
5082+
for (auto &MatchingCall : Node->MatchingCalls) {
5083+
if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) {
5084+
CallInfo CallClone = It->second;
5085+
CallVector.push_back(CallClone);
5086+
} else {
5087+
// All but the original clone (skipped earlier) should have an entry
5088+
// for all calls.
5089+
assert(false && "Expected to find call in CallMap");
5090+
}
5091+
}
5092+
}
5093+
}
49995094
}
50005095

50015096
uint8_t BothTypes =
@@ -5057,6 +5152,26 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
50575152
// Update all the matching calls as well.
50585153
for (auto &Call : Node->MatchingCalls)
50595154
updateCall(Call, CalleeFunc);
5155+
5156+
// Now update all calls recorded earlier that are still in function clones
5157+
// which don't have a clone of this callsite node.
5158+
if (!UnassignedCallClones.contains(Node))
5159+
return;
5160+
DenseSet<unsigned> NodeCallClones;
5161+
for (auto *C : Node->Clones)
5162+
NodeCallClones.insert(C->Call.cloneNo());
5163+
// Note that we already confirmed Node is in this map a few lines above.
5164+
auto &ClonedCalls = UnassignedCallClones[Node];
5165+
for (auto &[CloneNo, CallVector] : ClonedCalls) {
5166+
// Should start at 1 as we never create an entry for original node.
5167+
assert(CloneNo > 0);
5168+
// If we subsequently created a clone, skip this one.
5169+
if (NodeCallClones.contains(CloneNo))
5170+
continue;
5171+
// Use the original Node's CalleeFunc.
5172+
for (auto &Call : CallVector)
5173+
updateCall(Call, CalleeFunc);
5174+
}
50605175
};
50615176

50625177
// Performs DFS traversal starting from allocation nodes to update calls to
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
;; Similar to funcassigncloning.ll but hand modified to add another allocation
2+
;; whose pruned cold context only includes an immediate caller node that itself
3+
;; doesn't need cloning, but calls a cloned allocating function, and is in a
4+
;; function that gets cloned multiple times for a different callsite. This test
5+
;; makes sure the non-cloned callsite is correctly updated in all function
6+
;; clones. This case was missed because, due to context pruning, we don't have
7+
;; any caller edges for the first callsite, so the handling that kicks in to
8+
;; "reclone" other callsites in cloned functions was being missed.
9+
10+
; RUN: opt -thinlto-bc %s >%t.o
11+
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
12+
; RUN: -supports-hot-cold-new \
13+
; RUN: -r=%t.o,main,plx \
14+
; RUN: -r=%t.o,_Znam, \
15+
; RUN: -memprof-verify-ccg -memprof-verify-nodes \
16+
; RUN: -pass-remarks=memprof-context-disambiguation -save-temps \
17+
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKS
18+
19+
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
20+
21+
22+
;; Try again but with distributed ThinLTO
23+
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
24+
; RUN: -supports-hot-cold-new \
25+
; RUN: -thinlto-distributed-indexes \
26+
; RUN: -r=%t.o,main,plx \
27+
; RUN: -r=%t.o,_Znam, \
28+
; RUN: -memprof-verify-ccg -memprof-verify-nodes \
29+
; RUN: -pass-remarks=memprof-context-disambiguation \
30+
; RUN: -o %t2.out
31+
32+
;; Run ThinLTO backend
33+
; RUN: opt -passes=memprof-context-disambiguation \
34+
; RUN: -memprof-import-summary=%t.o.thinlto.bc \
35+
; RUN: -pass-remarks=memprof-context-disambiguation \
36+
; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
37+
; RUN: --check-prefix=REMARKS
38+
39+
40+
source_filename = "funcassigncloning.ll"
41+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
42+
target triple = "x86_64-unknown-linux-gnu"
43+
44+
;; Eventually this function will be cloned several times (for the calls to new
45+
;; for the various callers). However, function blah() includes an allocation
46+
;; whose cold context was trimmed above here. We therefore should assume that
47+
;; every caller of this function should call the same version of blah (which
48+
;; will be the cloned ".memprof.1" version).
49+
; Function Attrs: noinline optnone
50+
define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 {
51+
entry:
52+
call void @blah(), !callsite !19
53+
%call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7
54+
%call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15
55+
ret void
56+
}
57+
58+
; REMARKS: call in clone _Z1EPPcS0_ assigned to call function clone blah.memprof.1
59+
; REMARKS: call in clone _Z1EPPcS0_.memprof.1 assigned to call function clone blah.memprof.1
60+
; REMARKS: call in clone _Z1EPPcS0_.memprof.2 assigned to call function clone blah.memprof.1
61+
; REMARKS: call in clone _Z1EPPcS0_.memprof.3 assigned to call function clone blah.memprof.1
62+
63+
; IR: define {{.*}} @_Z1EPPcS0_
64+
; IR: call {{.*}} @blah.memprof.1()
65+
; IR: define {{.*}} @_Z1EPPcS0_.memprof.2
66+
; IR: call {{.*}} @blah.memprof.1()
67+
; IR: define {{.*}} @_Z1EPPcS0_.memprof.3
68+
; IR: call {{.*}} @blah.memprof.1()
69+
70+
declare ptr @_Znam(i64)
71+
72+
define internal void @_Z1BPPcS0_() {
73+
entry:
74+
call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16
75+
ret void
76+
}
77+
78+
define internal void @_Z1CPPcS0_() {
79+
entry:
80+
call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !17
81+
ret void
82+
}
83+
84+
define internal void @_Z1DPPcS0_() {
85+
entry:
86+
call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18
87+
ret void
88+
}
89+
90+
; Function Attrs: noinline optnone
91+
define i32 @main() #0 {
92+
entry:
93+
call void @_Z1BPPcS0_()
94+
call void @_Z1CPPcS0_()
95+
call void @_Z1DPPcS0_()
96+
ret i32 0
97+
}
98+
99+
define internal void @blah() #0 {
100+
entry:
101+
%call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !22, !callsite !21
102+
ret void
103+
}
104+
105+
define internal void @foo() #0 {
106+
entry:
107+
call void @blah(), !callsite !20
108+
ret void
109+
}
110+
111+
; uselistorder directives
112+
uselistorder ptr @_Znam, { 1, 0, 2 }
113+
114+
attributes #0 = { noinline optnone }
115+
116+
!0 = !{!1, !3, !5}
117+
!1 = !{!2, !"cold"}
118+
!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
119+
!3 = !{!4, !"notcold"}
120+
!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
121+
!5 = !{!6, !"notcold"}
122+
!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
123+
!7 = !{i64 -3461278137325233666}
124+
!8 = !{!9, !11, !13}
125+
!9 = !{!10, !"notcold"}
126+
!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
127+
!11 = !{!12, !"cold"}
128+
!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
129+
!13 = !{!14, !"notcold"}
130+
!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
131+
!15 = !{i64 -1415475215210681400}
132+
!16 = !{i64 -2441057035866683071}
133+
!17 = !{i64 -3483158674395044949}
134+
!18 = !{i64 -7799663586031895603}
135+
!19 = !{i64 123}
136+
!20 = !{i64 234}
137+
!21 = !{i64 345}
138+
!22 = !{!23, !25}
139+
!23 = !{!24, !"cold"}
140+
!24 = !{i64 345, i64 123}
141+
!25 = !{!26, !"notcold"}
142+
!26 = !{i64 345, i64 234}

0 commit comments

Comments
 (0)