-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[MemProf] Make sure call clones without callsite node clones get updated #159861
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4553,6 +4553,34 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { | |
| DenseMap<CallInfo, CallInfo> CallMap; | ||
| }; | ||
|
|
||
| // Map to keep track of information needed to update calls in function clones | ||
| // when their corresponding callsite node was not itself cloned for that | ||
| // function clone. Because of call context pruning (i.e. we only keep as much | ||
| // caller information as needed to distinguish hot vs cold), we may not have | ||
| // caller edges coming to each callsite node from all possible function | ||
| // callers. A function clone may get created for other callsites in the | ||
| // function for which there are caller edges that were not pruned. Any other | ||
| // callsites in that function clone, which were not themselved cloned for | ||
| // that function clone, should get updated the same way as the corresponding | ||
| // callsite in the original function (which may call a clone of its callee). | ||
| // | ||
| // We build this map after completing function cloning for each function, so | ||
| // that we can record the information from its call maps before they are | ||
| // destructed. The map will be used as we update calls to update any still | ||
| // unassigned call clones. Note that we may create new node clones as we clone | ||
| // other functions, so later on we check which node clones were still not | ||
| // created. To this end, the inner map is a map from function clone number to | ||
| // the list of calls cloned for that function (can be more than one due to the | ||
| // Node's MatchingCalls array). | ||
| // | ||
| // The alternative is creating new callsite clone nodes below as we clone the | ||
| // function, but that is tricker to get right and likely more overhead. | ||
| // | ||
| // Inner map is a std::map so sorted by key (clone number), in order to get | ||
| // ordered remarks in the full LTO case. | ||
| DenseMap<const ContextNode *, std::map<unsigned, SmallVector<CallInfo, 0>>> | ||
| UnassignedCallClones; | ||
|
|
||
| // Walk all functions for which we saw calls with memprof metadata, and handle | ||
| // cloning for each of its calls. | ||
| for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { | ||
|
|
@@ -4986,6 +5014,63 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { | |
| } | ||
| } | ||
| } | ||
|
|
||
| if (FuncCloneInfos.size() < 2) | ||
| continue; | ||
|
|
||
| // In this case there is more than just the original function copy. | ||
| // Record call clones of any callsite nodes in the function that did not | ||
| // themselves get cloned for all of the function clones. | ||
| for (auto &Call : CallsWithMetadata) { | ||
| ContextNode *Node = getNodeForInst(Call); | ||
| if (!Node || !Node->hasCall() || Node->emptyContextIds()) | ||
| continue; | ||
| // If Node has enough clones already to cover all function clones, we can | ||
| // skip it. Need to add one for the original copy. | ||
| // Use >= in case there were clones that were skipped due to having empty | ||
| // context ids | ||
| if (Node->Clones.size() + 1 >= FuncCloneInfos.size()) | ||
| continue; | ||
| // First collect all function clones we cloned this callsite node for. | ||
| // They may not be sequential due to empty clones e.g. | ||
| DenseSet<unsigned> NodeCallClones; | ||
| for (auto *C : Node->Clones) | ||
| NodeCallClones.insert(C->Call.cloneNo()); | ||
| unsigned I = 0; | ||
| // Now check all the function clones. | ||
| for (auto &FC : FuncCloneInfos) { | ||
| // Function clones should be sequential. | ||
| assert(FC.FuncClone.cloneNo() == I); | ||
| // Skip the first clone which got the original call. | ||
| // Also skip any other clones created for this Node. | ||
| if (++I == 1 || NodeCallClones.contains(I)) { | ||
| continue; | ||
| } | ||
| // Record the call clones created for this callsite in this function | ||
| // clone. | ||
| auto &CallVector = UnassignedCallClones[Node][I]; | ||
| DenseMap<CallInfo, CallInfo> &CallMap = FC.CallMap; | ||
| CallInfo CallClone(Call); | ||
| if (auto It = CallMap.find(Call); It != CallMap.end()) | ||
| CallClone = It->second; | ||
| else | ||
| // All but the original clone (skipped earlier) should have an entry | ||
| // for all calls. | ||
| assert(false && "Expected to find call in CallMap"); | ||
| CallVector.push_back(CallClone); | ||
|
||
| // Need to do the same for all matching calls. | ||
| for (auto &MatchingCall : Node->MatchingCalls) { | ||
| CallInfo CallClone(MatchingCall); | ||
|
||
| if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) | ||
| CallClone = It->second; | ||
| else | ||
| // All but the original clone (skipped earlier) should have an entry | ||
| // for all calls. | ||
| assert(false && "Expected to find call in CallMap"); | ||
| CallVector.push_back(CallClone); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| uint8_t BothTypes = | ||
|
|
@@ -5047,6 +5132,25 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { | |
| // Update all the matching calls as well. | ||
| for (auto &Call : Node->MatchingCalls) | ||
| updateCall(Call, CalleeFunc); | ||
|
|
||
| // Now update all calls recorded earlier that are still in function clones | ||
| // which don't have a clone of this callsite node. | ||
| if (!UnassignedCallClones.contains(Node)) | ||
| return; | ||
| DenseSet<unsigned> NodeCallClones; | ||
| for (auto *C : Node->Clones) | ||
| NodeCallClones.insert(C->Call.cloneNo()); | ||
| auto &ClonedCalls = UnassignedCallClones[Node]; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can't do this because at() returns a const reference. Note a few lines above we continue if the entry doesn't exist - I added a comment to this effect. |
||
| for (auto &[CloneNo, CallVector] : ClonedCalls) { | ||
| // Should start at 1 as we never create an entry for original node. | ||
| assert(CloneNo > 0); | ||
| // If we subsequently created a clone, skip this one. | ||
| if (NodeCallClones.contains(CloneNo)) | ||
| continue; | ||
| // Use the original Node's CalleeFunc. | ||
| for (auto &Call : CallVector) | ||
| updateCall(Call, CalleeFunc); | ||
| } | ||
| }; | ||
|
|
||
| // Performs DFS traversal starting from allocation nodes to update calls to | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,142 @@ | ||
| ;; Similar to funcassigncloning.ll but hand modified to add another allocation | ||
| ;; whose pruned cold context only includes an immediate caller node that itself | ||
| ;; doesn't need cloning, but calls a cloned allocating function, and is in a | ||
| ;; function that gets cloned multiple times for a different callsite. This test | ||
| ;; makes sure the non-cloned callsite is correctly updated in all function | ||
| ;; clones. This case was missed because due to context pruning we don't have any | ||
|
||
| ;; caller edges for the first callsite, so the handling that kicks in to | ||
| ;; "reclone" other callsites in cloned functions was being missed. | ||
|
|
||
| ; RUN: opt -thinlto-bc %s >%t.o | ||
| ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ | ||
| ; RUN: -supports-hot-cold-new \ | ||
| ; RUN: -r=%t.o,main,plx \ | ||
| ; RUN: -r=%t.o,_Znam, \ | ||
| ; RUN: -memprof-verify-ccg -memprof-verify-nodes \ | ||
| ; RUN: -pass-remarks=memprof-context-disambiguation -save-temps \ | ||
| ; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKS | ||
|
|
||
| ; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR | ||
|
|
||
|
|
||
| ;; Try again but with distributed ThinLTO | ||
| ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ | ||
| ; RUN: -supports-hot-cold-new \ | ||
| ; RUN: -thinlto-distributed-indexes \ | ||
| ; RUN: -r=%t.o,main,plx \ | ||
| ; RUN: -r=%t.o,_Znam, \ | ||
| ; RUN: -memprof-verify-ccg -memprof-verify-nodes \ | ||
| ; RUN: -pass-remarks=memprof-context-disambiguation \ | ||
| ; RUN: -o %t2.out | ||
|
|
||
| ;; Run ThinLTO backend | ||
| ; RUN: opt -passes=memprof-context-disambiguation \ | ||
| ; RUN: -memprof-import-summary=%t.o.thinlto.bc \ | ||
| ; RUN: -pass-remarks=memprof-context-disambiguation \ | ||
| ; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ | ||
| ; RUN: --check-prefix=REMARKS | ||
|
|
||
|
|
||
| source_filename = "funcassigncloning.ll" | ||
| target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" | ||
| target triple = "x86_64-unknown-linux-gnu" | ||
|
|
||
| ;; Eventually this function will be cloned several times (for the calls to new | ||
| ;; for the various callers). However, function blah() includes an allocation | ||
| ;; whose cold context was trimmed above here. We therefore should assume that | ||
| ;; every caller of this function should call the same version of blah (which | ||
| ;; will be the cloned ".memprof.1" version. | ||
| ; Function Attrs: noinline optnone | ||
| define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 { | ||
| entry: | ||
| call void @blah(), !callsite !19 | ||
| %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7 | ||
| %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15 | ||
| ret void | ||
| } | ||
|
|
||
| ; REMARKS: call in clone _Z1EPPcS0_ assigned to call function clone blah.memprof.1 | ||
| ; REMARKS: call in clone _Z1EPPcS0_.memprof.1 assigned to call function clone blah.memprof.1 | ||
| ; REMARKS: call in clone _Z1EPPcS0_.memprof.2 assigned to call function clone blah.memprof.1 | ||
| ; REMARKS: call in clone _Z1EPPcS0_.memprof.3 assigned to call function clone blah.memprof.1 | ||
|
|
||
| ; IR: define {{.*}} @_Z1EPPcS0_ | ||
| ; IR: call {{.*}} @blah.memprof.1() | ||
| ; IR: define {{.*}} @_Z1EPPcS0_.memprof.2 | ||
| ; IR: call {{.*}} @blah.memprof.1() | ||
| ; IR: define {{.*}} @_Z1EPPcS0_.memprof.3 | ||
| ; IR: call {{.*}} @blah.memprof.1() | ||
|
|
||
| declare ptr @_Znam(i64) | ||
|
|
||
| define internal void @_Z1BPPcS0_() { | ||
| entry: | ||
| call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16 | ||
| ret void | ||
| } | ||
|
|
||
| define internal void @_Z1CPPcS0_() { | ||
| entry: | ||
| call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !17 | ||
| ret void | ||
| } | ||
|
|
||
| define internal void @_Z1DPPcS0_() { | ||
| entry: | ||
| call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18 | ||
| ret void | ||
| } | ||
|
|
||
| ; Function Attrs: noinline optnone | ||
| define i32 @main() #0 { | ||
| entry: | ||
| call void @_Z1BPPcS0_() | ||
| call void @_Z1CPPcS0_() | ||
| call void @_Z1DPPcS0_() | ||
| ret i32 0 | ||
| } | ||
|
|
||
| define internal void @blah() #0 { | ||
| entry: | ||
| %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !22, !callsite !21 | ||
| ret void | ||
| } | ||
|
|
||
| define internal void @foo() #0 { | ||
| entry: | ||
| call void @blah(), !callsite !20 | ||
| ret void | ||
| } | ||
|
|
||
| ; uselistorder directives | ||
| uselistorder ptr @_Znam, { 1, 0, 2 } | ||
|
|
||
| attributes #0 = { noinline optnone } | ||
|
|
||
| !0 = !{!1, !3, !5} | ||
| !1 = !{!2, !"cold"} | ||
| !2 = !{i64 -3461278137325233666, i64 -7799663586031895603} | ||
| !3 = !{!4, !"notcold"} | ||
| !4 = !{i64 -3461278137325233666, i64 -3483158674395044949} | ||
| !5 = !{!6, !"notcold"} | ||
| !6 = !{i64 -3461278137325233666, i64 -2441057035866683071} | ||
| !7 = !{i64 -3461278137325233666} | ||
| !8 = !{!9, !11, !13} | ||
| !9 = !{!10, !"notcold"} | ||
| !10 = !{i64 -1415475215210681400, i64 -2441057035866683071} | ||
| !11 = !{!12, !"cold"} | ||
| !12 = !{i64 -1415475215210681400, i64 -3483158674395044949} | ||
| !13 = !{!14, !"notcold"} | ||
| !14 = !{i64 -1415475215210681400, i64 -7799663586031895603} | ||
| !15 = !{i64 -1415475215210681400} | ||
| !16 = !{i64 -2441057035866683071} | ||
| !17 = !{i64 -3483158674395044949} | ||
| !18 = !{i64 -7799663586031895603} | ||
| !19 = !{i64 123} | ||
| !20 = !{i64 234} | ||
| !21 = !{i64 345} | ||
| !22 = !{!23, !25} | ||
| !23 = !{!24, !"cold"} | ||
| !24 = !{i64 345, i64 123} | ||
| !25 = !{!26, !"notcold"} | ||
| !26 = !{i64 345, i64 234} | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,122 @@ | ||
| ;; Similar to funcassigncloning.ll but hand modified to add another allocation | ||
| ;; whose pruned cold context only includes an immediate caller node that itself | ||
| ;; doesn't need cloning, but calls a cloned allocating function, and is in a | ||
| ;; function that gets cloned multiple times for a different callsite. This test | ||
| ;; makes sure the non-cloned callsite is correctly updated in all function | ||
| ;; clones. This case was missed because due to context pruning we don't have any | ||
| ;; caller edges for the first callsite, so the handling that kicks in to | ||
| ;; "reclone" other callsites in cloned functions was being missed. | ||
|
|
||
| ; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ | ||
| ; RUN: -memprof-verify-ccg -memprof-verify-nodes \ | ||
| ; RUN: -pass-remarks=memprof-context-disambiguation \ | ||
| ; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=IR --check-prefix=REMARKS | ||
|
|
||
|
|
||
| target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" | ||
| target triple = "x86_64-unknown-linux-gnu" | ||
|
|
||
| ;; Eventually this function will be cloned several times (for the calls to new | ||
| ;; for the various callers). However, function blah() includes an allocation | ||
| ;; whose cold context was trimmed above here. We therefore should assume that | ||
| ;; every caller of this function should call the same version of blah (which | ||
| ;; will be the cloned ".memprof.1" version. | ||
| define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 { | ||
| entry: | ||
| call void @blah(), !callsite !19 | ||
| %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !0, !callsite !7 | ||
| %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !8, !callsite !15 | ||
| ret void | ||
| } | ||
|
|
||
| ; REMARKS: created clone blah.memprof.1 | ||
| ; REMARKS: call in clone _Z1EPPcS0_ assigned to call function clone blah.memprof.1 | ||
| ; REMARKS: call in clone _Z1EPPcS0_.memprof.1 assigned to call function clone blah.memprof.1 | ||
| ; REMARKS: call in clone _Z1EPPcS0_.memprof.2 assigned to call function clone blah.memprof.1 | ||
| ; REMARKS: call in clone _Z1EPPcS0_.memprof.3 assigned to call function clone blah.memprof.1 | ||
|
|
||
| ; IR: define {{.*}} @_Z1EPPcS0_ | ||
| ; IR: call {{.*}} @blah.memprof.1() | ||
| ; IR: define {{.*}} @_Z1EPPcS0_.memprof.1 | ||
| ; IR: call {{.*}} @blah.memprof.1() | ||
| ; IR: define {{.*}} @_Z1EPPcS0_.memprof.2 | ||
| ; IR: call {{.*}} @blah.memprof.1() | ||
| ; IR: define {{.*}} @_Z1EPPcS0_.memprof.3 | ||
| ; IR: call {{.*}} @blah.memprof.1() | ||
|
|
||
| declare ptr @_Znam(i64) #1 | ||
|
|
||
| define internal void @_Z1BPPcS0_(ptr %0, ptr %1) { | ||
| entry: | ||
| call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !16 | ||
| ret void | ||
| } | ||
|
|
||
| ; Function Attrs: noinline | ||
| define internal void @_Z1CPPcS0_(ptr %0, ptr %1) #2 { | ||
| entry: | ||
| call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !17 | ||
| ret void | ||
| } | ||
|
|
||
| define internal void @_Z1DPPcS0_(ptr %0, ptr %1) #3 { | ||
| entry: | ||
| call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !18 | ||
| ret void | ||
| } | ||
|
|
||
| define internal void @blah() #0 { | ||
| entry: | ||
| %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !22, !callsite !21 | ||
| ret void | ||
| } | ||
|
|
||
| define internal void @foo() #0 { | ||
| entry: | ||
| call void @blah(), !callsite !20 | ||
| ret void | ||
| } | ||
|
|
||
| ; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) | ||
| declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 | ||
|
|
||
| declare i32 @sleep() #5 | ||
|
|
||
| ; uselistorder directives | ||
| uselistorder ptr @_Znam, { 1, 0, 2 } | ||
|
|
||
| attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } | ||
| attributes #1 = { "no-trapping-math"="true" } | ||
| attributes #2 = { noinline } | ||
| attributes #3 = { "frame-pointer"="all" } | ||
| attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) } | ||
| attributes #5 = { "disable-tail-calls"="true" } | ||
| attributes #6 = { builtin } | ||
|
|
||
| !0 = !{!1, !3, !5} | ||
| !1 = !{!2, !"cold"} | ||
| !2 = !{i64 -3461278137325233666, i64 -7799663586031895603} | ||
| !3 = !{!4, !"notcold"} | ||
| !4 = !{i64 -3461278137325233666, i64 -3483158674395044949} | ||
| !5 = !{!6, !"notcold"} | ||
| !6 = !{i64 -3461278137325233666, i64 -2441057035866683071} | ||
| !7 = !{i64 -3461278137325233666} | ||
| !8 = !{!9, !11, !13} | ||
| !9 = !{!10, !"notcold"} | ||
| !10 = !{i64 -1415475215210681400, i64 -2441057035866683071} | ||
| !11 = !{!12, !"cold"} | ||
| !12 = !{i64 -1415475215210681400, i64 -3483158674395044949} | ||
| !13 = !{!14, !"notcold"} | ||
| !14 = !{i64 -1415475215210681400, i64 -7799663586031895603} | ||
| !15 = !{i64 -1415475215210681400} | ||
| !16 = !{i64 -2441057035866683071} | ||
| !17 = !{i64 -3483158674395044949} | ||
| !18 = !{i64 -7799663586031895603} | ||
| !19 = !{i64 123} | ||
| !20 = !{i64 234} | ||
| !21 = !{i64 345} | ||
| !22 = !{!23, !25} | ||
| !23 = !{!24, !"cold"} | ||
| !24 = !{i64 345, i64 123} | ||
| !25 = !{!26, !"notcold"} | ||
| !26 = !{i64 345, i64 234} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does it make sense to enhance the pseudo-code in the function comment with this detail?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done