Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 81 additions & 10 deletions llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,12 @@ static cl::opt<bool>
cl::desc("Salvage stale MemProf profile"),
cl::init(false), cl::Hidden);

static cl::opt<bool> ClMemProfAttachCalleeGuids(
"memprof-attach-calleeguids",
cl::desc(
"Attach calleeguids as value profile metadata for indirect calls."),
cl::init(true), cl::Hidden);

extern cl::opt<bool> MemProfReportHintedSizes;
extern cl::opt<unsigned> MinClonedColdBytePercent;
extern cl::opt<unsigned> MinCallsiteColdBytePercent;
Expand Down Expand Up @@ -964,6 +970,44 @@ undriftMemProfRecord(const DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
UndriftCallStack(CS.Frames);
}

// Helper function to process CalleeGuids and create value profile metadata
static void addVPMetadata(Module &M, Instruction &I,
ArrayRef<GlobalValue::GUID> CalleeGuids) {
if (!ClMemProfAttachCalleeGuids || CalleeGuids.empty())
return;

if (I.getMetadata(LLVMContext::MD_prof)) {
uint64_t Unused;
// ~0U means get all available value profile data without any count limit
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like overkill, why do we need them all given that we quit below if it is non-empty? At least for now why not just set this to 1? For eventual merging, it could be increased to one of the max values for this call in ICP. If you change this add a TODO

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, done.

auto ExistingVD = getValueProfDataFromInst(I, IPVK_IndirectCallTarget,
/*MaxNumValueData=*/~0U, Unused);
// We don't know how to merge value profile data yet.
if (!ExistingVD.empty()) {
return;
}
}

SmallVector<InstrProfValueData, 4> VDs;
uint64_t TotalCount = 0;

for (const GlobalValue::GUID CalleeGUID : CalleeGuids) {
InstrProfValueData VD;
VD.Value = CalleeGUID;
// For MemProf, we don't have actual call counts, so we assign
// a weight of 1 to each potential target.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally this could be used to drive ICP too, but I'm not sure a weight of 1 will be hot enough. Should it either be a configurable value, or add a FIXME or TODO to adjust this to allow for ICP?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added TODO.

// TODO: Consider making this weight configurable or increasing it to
// improve effectiveness for ICP.
VD.Count = 1;
VDs.push_back(VD);
TotalCount += VD.Count;
}

if (!VDs.empty()) {
annotateValueSite(M, I, VDs, TotalCount, IPVK_IndirectCallTarget,
VDs.size());
}
}

static void
readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
const TargetLibraryInfo &TLI,
Expand Down Expand Up @@ -1031,15 +1075,35 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
// Build maps of the location hash to all profile data with that leaf location
// (allocation info and the callsites).
std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo;
// A hash function for std::unordered_set<ArrayRef<Frame>> to work.
struct CallStackHash {
size_t operator()(ArrayRef<Frame> CS) const {
return computeFullStackId(CS);

// Helper struct for maintaining refs to callsite data. As an alternative we
// could store a pointer to the CallSiteInfo struct but we also need the frame
// index. Using ArrayRefs instead makes it a little easier to read.
struct CallSiteEntry {
// Subset of frames for the corresponding CallSiteInfo.
ArrayRef<Frame> Frames;
// Potential targets for indirect calls.
ArrayRef<GlobalValue::GUID> CalleeGuids;

// Only compare Frame contents.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe comment why you aren't using ArrayRef operator==. I guess that does elementwise equality and here you are looking for the underlying pointer being the same?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, added comment. It's also faster this way.

// Use pointer-based equality instead of ArrayRef's operator== which does
// element-wise comparison. We want to check if it's the same slice of the
// underlying array, not just equivalent content.
bool operator==(const CallSiteEntry &Other) const {
return Frames.data() == Other.Frames.data() &&
Frames.size() == Other.Frames.size();
}
};

struct CallSiteEntryHash {
size_t operator()(const CallSiteEntry &Entry) const {
return computeFullStackId(Entry.Frames);
}
};

// For the callsites we need to record slices of the frame array (see comments
// below where the map entries are added).
std::map<uint64_t, std::unordered_set<ArrayRef<Frame>, CallStackHash>>
// below where the map entries are added) along with their CalleeGuids.
std::map<uint64_t, std::unordered_set<CallSiteEntry, CallSiteEntryHash>>
LocHashToCallSites;
for (auto &AI : MemProfRec->AllocSites) {
NumOfMemProfAllocContextProfiles++;
Expand All @@ -1057,8 +1121,10 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
unsigned Idx = 0;
for (auto &StackFrame : CS.Frames) {
uint64_t StackId = computeStackId(StackFrame);
LocHashToCallSites[StackId].insert(
ArrayRef<Frame>(CS.Frames).drop_front(Idx++));
ArrayRef<Frame> FrameSlice = ArrayRef<Frame>(CS.Frames).drop_front(Idx++);
ArrayRef<GlobalValue::GUID> CalleeGuids(CS.CalleeGuids);
LocHashToCallSites[StackId].insert({FrameSlice, CalleeGuids});

ProfileHasColumns |= StackFrame.Column;
// Once we find this function, we can stop recording.
if (StackFrame.Function == FuncGUID)
Expand Down Expand Up @@ -1202,13 +1268,18 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
// Otherwise, add callsite metadata. If we reach here then we found the
// instruction's leaf location in the callsites map and not the allocation
// map.
for (auto CallStackIdx : CallSitesIter->second) {
for (const auto &CallSiteEntry : CallSitesIter->second) {
// If we found and thus matched all frames on the call, create and
// attach call stack metadata.
if (stackFrameIncludesInlinedCallStack(CallStackIdx,
if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames,
InlinedCallStack)) {
NumOfMemProfMatchedCallSites++;
addCallsiteMetadata(I, InlinedCallStack, Ctx);

// Try to attach indirect call metadata if possible.
if (!CalledFunction)
addVPMetadata(M, I, CallSiteEntry.CalleeGuids);

// Only need to find one with a matching call stack and add a single
// callsite metadata.

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
; RUN: split-file %s %t

;; Basic functionality with flag toggle
; RUN: llvm-profdata merge --memprof-version=4 %t/basic.yaml -o %t/basic.memprofdata
; RUN: opt < %t/basic.ll -passes='memprof-use<profile-filename=%t/basic.memprofdata>' -memprof-attach-calleeguids=false -S 2>&1 | FileCheck %s --check-prefix=CHECK-DISABLE
; RUN: opt < %t/basic.ll -passes='memprof-use<profile-filename=%t/basic.memprofdata>' -memprof-attach-calleeguids=true -S 2>&1 | FileCheck %s --check-prefix=CHECK-ENABLE

;; FDO conflict handling
; RUN: llvm-profdata merge --memprof-version=4 %t/fdo_conflict.yaml -o %t/fdo_conflict.memprofdata
; RUN: opt < %t/fdo_conflict.ll -passes='memprof-use<profile-filename=%t/fdo_conflict.memprofdata>' -memprof-attach-calleeguids=true -S 2>&1 | FileCheck %s --check-prefix=CHECK-FDO

;--- basic.yaml
---
HeapProfileRecords:
- GUID: _Z3barv
AllocSites: []
CallSites:
- Frames:
- { Function: _Z3barv, LineOffset: 3, Column: 5, IsInlineFrame: false }
CalleeGuids: [0x123456789abcdef0, 0x23456789abcdef01]
...

;--- basic.ll
define dso_local void @_Z3barv() !dbg !4 {
entry:
%fp = alloca ptr, align 8
%0 = load ptr, ptr %fp, align 8
call void %0(), !dbg !5
; CHECK-ENABLE: call void %0(), {{.*}} !prof !6
; CHECK-DISABLE-NOT: !prof
ret void
}

; CHECK-ENABLE: !6 = !{!"VP", i32 0, i64 2, i64 1311768467463790320, i64 1, i64 2541551405711093505, i64 1}

!llvm.module.flags = !{!2, !3}

!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1)
!1 = !DIFile(filename: "t", directory: "/")
!2 = !{i32 7, !"Dwarf Version", i32 5}
!3 = !{i32 2, !"Debug Info Version", i32 3}
!4 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 1, unit: !0)
!5 = !DILocation(line: 4, column: 5, scope: !4)

;--- fdo_conflict.yaml
---
HeapProfileRecords:
- GUID: _Z3foov
AllocSites: []
CallSites:
- Frames:
- { Function: _Z3foov, LineOffset: 3, Column: 5, IsInlineFrame: false }
CalleeGuids: [0x123456789abcdef0, 0x23456789abcdef01]
- Frames:
- { Function: _Z3foov, LineOffset: 5, Column: 5, IsInlineFrame: false }
CalleeGuids: [0x555556789abcdef0, 0x666656789abcdef1]
...

;--- fdo_conflict.ll
define dso_local void @_Z3foov() !dbg !14 {
entry:
%fp = alloca ptr, align 8
%0 = load ptr, ptr %fp, align 8
; This call already has FDO value profile metadata - should NOT be modified
; CHECK-FDO: call void %0(), {{.*}} !prof !6
call void %0(), !dbg !15, !prof !16

%1 = load ptr, ptr %fp, align 8
; This call does NOT have existing metadata - should get MemProf annotation
; CHECK-FDO: call void %1(), {{.*}} !prof !9
call void %1(), !dbg !17
ret void
}

!16 = !{!"VP", i32 0, i64 100, i64 9191153033785521275, i64 80, i64 -1069303473483922844, i64 20}

; CHECK-FDO: !6 = !{!"VP", i32 0, i64 100, i64 9191153033785521275, i64 80, i64 -1069303473483922844, i64 20}
; CHECK-FDO: !9 = !{!"VP", i32 0, i64 2, i64 6148915942236413680, i64 1, i64 7378680115485269745, i64 1}

!llvm.module.flags = !{!12, !13}

!10 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !11)
!11 = !DIFile(filename: "t", directory: "/")
!12 = !{i32 7, !"Dwarf Version", i32 5}
!13 = !{i32 2, !"Debug Info Version", i32 3}
!14 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !11, file: !11, line: 1, unit: !10)
!15 = !DILocation(line: 4, column: 5, scope: !14)
!17 = !DILocation(line: 6, column: 5, scope: !14)