Skip to content

Commit e3905a4

Browse files
[MemProf] Merge all callee guids for indirect call VP metadata (#170964)
When matching memprof profiles, for indirect calls we use the callee guids recorded on callsites in the profile to synthesize indirect call VP metadata when none exists. However, we only do this for the first matching CallSiteEntry from the profile. In some case there can be multiple, for example when the current function was eventually inlined into multiple callers. Profile generation propagates the CallSiteEntry from those callers into the inlined callee's profile as it may not yet have been inlined in the new compile. To capture all of these potential indirect call targets, merge callee guids across all matching CallSiteEntries.
1 parent 1e16f4e commit e3905a4

File tree

2 files changed

+55
-44
lines changed

2 files changed

+55
-44
lines changed

llvm/lib/Transforms/Instrumentation/MemProfUse.cpp

Lines changed: 41 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -509,59 +509,59 @@ struct CallSiteEntry {
509509
ArrayRef<Frame> Frames;
510510
// Potential targets for indirect calls.
511511
ArrayRef<GlobalValue::GUID> CalleeGuids;
512-
513-
// Only compare Frame contents.
514-
// Use pointer-based equality instead of ArrayRef's operator== which does
515-
// element-wise comparison. We want to check if it's the same slice of the
516-
// underlying array, not just equivalent content.
517-
bool operator==(const CallSiteEntry &Other) const {
518-
return Frames.data() == Other.Frames.data() &&
519-
Frames.size() == Other.Frames.size();
520-
}
521512
};
522513

523-
struct CallSiteEntryHash {
524-
size_t operator()(const CallSiteEntry &Entry) const {
525-
return computeFullStackId(Entry.Frames);
526-
}
527-
};
528-
529-
static void handleCallSite(
530-
Instruction &I, const Function *CalledFunction,
531-
ArrayRef<uint64_t> InlinedCallStack,
532-
const std::unordered_set<CallSiteEntry, CallSiteEntryHash> &CallSiteEntries,
533-
Module &M, std::set<std::vector<uint64_t>> &MatchedCallSites,
534-
OptimizationRemarkEmitter &ORE) {
514+
static void handleCallSite(Instruction &I, const Function *CalledFunction,
515+
ArrayRef<uint64_t> InlinedCallStack,
516+
const std::vector<CallSiteEntry> &CallSiteEntries,
517+
Module &M,
518+
std::set<std::vector<uint64_t>> &MatchedCallSites,
519+
OptimizationRemarkEmitter &ORE) {
535520
auto &Ctx = M.getContext();
521+
// Set of Callee GUIDs to attach to indirect calls. We accumulate all of them
522+
// to support cases where the instuction's inlined frames match multiple call
523+
// site entries, which can happen if the profile was collected from a binary
524+
// where this instruction was eventually inlined into multiple callers.
525+
SetVector<GlobalValue::GUID> CalleeGuids;
526+
bool CallsiteMDAdded = false;
536527
for (const auto &CallSiteEntry : CallSiteEntries) {
537528
// If we found and thus matched all frames on the call, create and
538529
// attach call stack metadata.
539530
if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames,
540531
InlinedCallStack)) {
541532
NumOfMemProfMatchedCallSites++;
542-
addCallsiteMetadata(I, InlinedCallStack, Ctx);
543-
544-
// Try to attach indirect call metadata if possible.
545-
if (!CalledFunction)
546-
addVPMetadata(M, I, CallSiteEntry.CalleeGuids);
547-
548533
// Only need to find one with a matching call stack and add a single
549534
// callsite metadata.
550-
551-
// Accumulate call site matching information upon request.
552-
if (ClPrintMemProfMatchInfo) {
553-
std::vector<uint64_t> CallStack;
554-
append_range(CallStack, InlinedCallStack);
555-
MatchedCallSites.insert(std::move(CallStack));
535+
if (!CallsiteMDAdded) {
536+
addCallsiteMetadata(I, InlinedCallStack, Ctx);
537+
538+
// Accumulate call site matching information upon request.
539+
if (ClPrintMemProfMatchInfo) {
540+
std::vector<uint64_t> CallStack;
541+
append_range(CallStack, InlinedCallStack);
542+
MatchedCallSites.insert(std::move(CallStack));
543+
}
544+
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemProfUse", &I)
545+
<< ore::NV("CallSite", &I) << " in function "
546+
<< ore::NV("Caller", I.getFunction())
547+
<< " matched callsite with frame count "
548+
<< ore::NV("Frames", InlinedCallStack.size()));
549+
550+
// If this is a direct call, we're done.
551+
if (CalledFunction)
552+
break;
553+
CallsiteMDAdded = true;
556554
}
557-
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemProfUse", &I)
558-
<< ore::NV("CallSite", &I) << " in function "
559-
<< ore::NV("Caller", I.getFunction())
560-
<< " matched callsite with frame count "
561-
<< ore::NV("Frames", InlinedCallStack.size()));
562-
break;
555+
556+
assert(!CalledFunction && "Didn't expect direct call");
557+
558+
// Collect Callee GUIDs from all matching CallSiteEntries.
559+
CalleeGuids.insert(CallSiteEntry.CalleeGuids.begin(),
560+
CallSiteEntry.CalleeGuids.end());
563561
}
564562
}
563+
// Try to attach indirect call metadata if possible.
564+
addVPMetadata(M, I, CalleeGuids.getArrayRef());
565565
}
566566

567567
static void readMemprof(Module &M, Function &F,
@@ -639,8 +639,7 @@ static void readMemprof(Module &M, Function &F,
639639

640640
// For the callsites we need to record slices of the frame array (see comments
641641
// below where the map entries are added) along with their CalleeGuids.
642-
std::map<uint64_t, std::unordered_set<CallSiteEntry, CallSiteEntryHash>>
643-
LocHashToCallSites;
642+
std::map<uint64_t, std::vector<CallSiteEntry>> LocHashToCallSites;
644643
for (auto &AI : MemProfRec->AllocSites) {
645644
NumOfMemProfAllocContextProfiles++;
646645
// Associate the allocation info with the leaf frame. The later matching
@@ -659,7 +658,7 @@ static void readMemprof(Module &M, Function &F,
659658
uint64_t StackId = computeStackId(StackFrame);
660659
ArrayRef<Frame> FrameSlice = ArrayRef<Frame>(CS.Frames).drop_front(Idx++);
661660
ArrayRef<GlobalValue::GUID> CalleeGuids(CS.CalleeGuids);
662-
LocHashToCallSites[StackId].insert({FrameSlice, CalleeGuids});
661+
LocHashToCallSites[StackId].push_back({FrameSlice, CalleeGuids});
663662

664663
ProfileHasColumns |= StackFrame.Column;
665664
// Once we find this function, we can stop recording.

llvm/test/Transforms/PGOProfile/memprof_annotate_indirect_call.test

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
;; Basic functionality with flag toggle
44
; RUN: llvm-profdata merge --memprof-version=4 %t/basic.yaml -o %t/basic.memprofdata
55
; RUN: opt < %t/basic.ll -passes='memprof-use<profile-filename=%t/basic.memprofdata>' -memprof-attach-calleeguids=false -S 2>&1 | FileCheck %s --check-prefix=CHECK-DISABLE
6-
; RUN: opt < %t/basic.ll -passes='memprof-use<profile-filename=%t/basic.memprofdata>' -memprof-attach-calleeguids=true -S 2>&1 | FileCheck %s --check-prefix=CHECK-ENABLE
6+
; RUN: opt < %t/basic.ll -passes='memprof-use<profile-filename=%t/basic.memprofdata>' -memprof-attach-calleeguids=true -S 2>&1 | FileCheck %s --check-prefix=CHECK-ENABLE --dump-input-filter=all
77

88
;; FDO conflict handling
99
; RUN: llvm-profdata merge --memprof-version=4 %t/fdo_conflict.yaml -o %t/fdo_conflict.memprofdata
@@ -18,6 +18,18 @@ HeapProfileRecords:
1818
- Frames:
1919
- { Function: _Z3barv, LineOffset: 3, Column: 5, IsInlineFrame: false }
2020
CalleeGuids: [0x123456789abcdef0, 0x23456789abcdef01]
21+
# The next 2 sets of frames simulates the case where this function was
22+
# eventually inlined into multiple callers. We would have propagated the
23+
# resulting frames and callee guids here for matching with they not yet
24+
# inlined bar. We should aggregate all callee guids into the metadata.
25+
- Frames:
26+
- { Function: _Z3barv, LineOffset: 3, Column: 5, IsInlineFrame: true }
27+
- { Function: _Z3foov, LineOffset: 1, Column: 6, IsInlineFrame: false }
28+
CalleeGuids: [0x1234, 0x2345]
29+
- Frames:
30+
- { Function: _Z3barv, LineOffset: 3, Column: 5, IsInlineFrame: true }
31+
- { Function: _Z3foov, LineOffset: 10, Column: 7, IsInlineFrame: false }
32+
CalleeGuids: [0x3456, 0x4567]
2133
...
2234

2335
;--- basic.ll
@@ -31,7 +43,7 @@ entry:
3143
ret void
3244
}
3345

34-
; CHECK-ENABLE: !6 = !{!"VP", i32 0, i64 2, i64 1311768467463790320, i64 1, i64 2541551405711093505, i64 1}
46+
; CHECK-ENABLE: !6 = !{!"VP", i32 0, i64 6, i64 1311768467463790320, i64 1, i64 2541551405711093505, i64 1, i64 4660, i64 1, i64 9029, i64 1, i64 13398, i64 1, i64 17767, i64 1}
3547

3648
!llvm.module.flags = !{!2, !3}
3749

0 commit comments

Comments
 (0)