diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h index 86c58d1261b71..1d98f86f50484 100644 --- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h +++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h @@ -103,7 +103,8 @@ class CallStackTrie { bool buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, std::vector &MIBCallStack, std::vector &MIBNodes, - bool CalleeHasAmbiguousCallerContext); + bool CalleeHasAmbiguousCallerContext, uint64_t &TotalBytes, + uint64_t &ColdBytes); public: CallStackTrie() = default; diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 924325b97eaaa..f9145854784f2 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -13,6 +13,7 @@ #include "llvm/Analysis/MemoryProfileInfo.h" #include "llvm/IR/Constants.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Format.h" using namespace llvm; using namespace llvm::memprof; @@ -58,6 +59,19 @@ cl::opt MemProfKeepAllNotColdContexts( "memprof-keep-all-not-cold-contexts", cl::init(false), cl::Hidden, cl::desc("Keep all non-cold contexts (increases cloning overheads)")); +cl::opt MinClonedColdBytePercent( + "memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, + cl::desc("Min percent of cold bytes to hint alloc cold during cloning")); + +// Discard non-cold contexts if they overlap with much larger cold contexts, +// specifically, if all contexts reaching a given callsite are at least this +// percent cold byte allocations. This reduces the amount of cloning required +// to expose the cold contexts when they greatly dominate non-cold contexts. +cl::opt MinCallsiteColdBytePercent( + "memprof-callsite-cold-threshold", cl::init(100), cl::Hidden, + cl::desc("Min percent of cold bytes at a callsite to discard non-cold " + "contexts")); + AllocationType llvm::memprof::getAllocType(uint64_t TotalLifetimeAccessDensity, uint64_t AllocCount, uint64_t TotalLifetime) { @@ -208,13 +222,29 @@ void CallStackTrie::addCallStack(MDNode *MIB) { static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef MIBCallStack, AllocationType AllocType, - ArrayRef ContextSizeInfo) { + ArrayRef ContextSizeInfo, + uint64_t &TotalBytes, uint64_t &ColdBytes) { SmallVector MIBPayload( {buildCallstackMetadata(MIBCallStack, Ctx)}); MIBPayload.push_back( MDString::get(Ctx, getAllocTypeAttributeString(AllocType))); - if (!ContextSizeInfo.empty()) { - for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) { + + if (ContextSizeInfo.empty()) { + // The profile matcher should have provided context size info if there was a + // MinCallsiteColdBytePercent < 100. Here we check >=100 to gracefully + // handle a user-provided percent larger than 100. + assert(MinCallsiteColdBytePercent >= 100); + return MDNode::get(Ctx, MIBPayload); + } + + for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) { + TotalBytes += TotalSize; + if (AllocType == AllocationType::Cold) + ColdBytes += TotalSize; + // Only add the context size info as metadata if we need it in the thin + // link (currently if reporting of hinted sizes is enabled or we have + // specified a threshold for marking allocations cold after cloning). + if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) { auto *FullStackIdMD = ValueAsMetadata::get( ConstantInt::get(Type::getInt64Ty(Ctx), FullStackId)); auto *TotalSizeMD = ValueAsMetadata::get( @@ -223,6 +253,7 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef MIBCallStack, MIBPayload.push_back(ContextSizeMD); } } + assert(TotalBytes > 0); return MDNode::get(Ctx, MIBPayload); } @@ -246,9 +277,14 @@ void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) { // on options that enable filtering out some NotCold contexts. static void saveFilteredNewMIBNodes(std::vector &NewMIBNodes, std::vector &SavedMIBNodes, - unsigned CallerContextLength) { + unsigned CallerContextLength, + uint64_t TotalBytes, uint64_t ColdBytes) { + const bool MostlyCold = + MinCallsiteColdBytePercent < 100 && + ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes; + // In the simplest case, with pruning disabled, keep all the new MIB nodes. - if (MemProfKeepAllNotColdContexts) { + if (MemProfKeepAllNotColdContexts && !MostlyCold) { append_range(SavedMIBNodes, NewMIBNodes); return; } @@ -271,6 +307,30 @@ static void saveFilteredNewMIBNodes(std::vector &NewMIBNodes, } }; + // If the cold bytes at the current callsite exceed the given threshold, we + // discard all non-cold contexts so do not need any of the later pruning + // handling. We can simply copy over all the cold contexts and return early. + if (MostlyCold) { + auto NewColdMIBNodes = + make_filter_range(NewMIBNodes, [&](const Metadata *M) { + auto MIBMD = cast(M); + // Only append cold contexts. + if (getMIBAllocType(MIBMD) == AllocationType::Cold) + return true; + if (MemProfReportHintedSizes) { + const float PercentCold = ColdBytes * 100.0 / TotalBytes; + std::string PercentStr; + llvm::raw_string_ostream OS(PercentStr); + OS << format(" for %5.2f%% cold bytes", PercentCold); + EmitMessageForRemovedContexts(MIBMD, "discarded", OS.str()); + } + return false; + }); + for (auto *M : NewColdMIBNodes) + SavedMIBNodes.push_back(M); + return; + } + // Prune unneeded NotCold contexts, taking advantage of the fact // that we later will only clone Cold contexts, as NotCold is the allocation // default. We only need to keep as metadata the NotCold contexts that @@ -341,17 +401,20 @@ static void saveFilteredNewMIBNodes(std::vector &NewMIBNodes, // Recursive helper to trim contexts and create metadata nodes. // Caller should have pushed Node's loc to MIBCallStack. Doing this in the // caller makes it simpler to handle the many early returns in this method. +// Updates the total and cold profiled bytes in the subtrie rooted at this node. bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, std::vector &MIBCallStack, std::vector &MIBNodes, - bool CalleeHasAmbiguousCallerContext) { + bool CalleeHasAmbiguousCallerContext, + uint64_t &TotalBytes, uint64_t &ColdBytes) { // Trim context below the first node in a prefix with a single alloc type. // Add an MIB record for the current call stack prefix. if (hasSingleAllocType(Node->AllocTypes)) { std::vector ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); - MIBNodes.push_back(createMIBNode( - Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo)); + MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, + (AllocationType)Node->AllocTypes, + ContextSizeInfo, TotalBytes, ColdBytes)); return true; } @@ -364,17 +427,25 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, // that will later be filtered before adding to the caller's MIBNodes // vector. std::vector NewMIBNodes; + // Determine the total and cold byte counts for all callers, then add to the + // caller's counts further below. + uint64_t CallerTotalBytes = 0; + uint64_t CallerColdBytes = 0; for (auto &Caller : Node->Callers) { MIBCallStack.push_back(Caller.first); - AddedMIBNodesForAllCallerContexts &= - buildMIBNodes(Caller.second, Ctx, MIBCallStack, NewMIBNodes, - NodeHasAmbiguousCallerContext); + AddedMIBNodesForAllCallerContexts &= buildMIBNodes( + Caller.second, Ctx, MIBCallStack, NewMIBNodes, + NodeHasAmbiguousCallerContext, CallerTotalBytes, CallerColdBytes); // Remove Caller. MIBCallStack.pop_back(); } // Pass in the stack length of the MIB nodes added for the immediate caller, // which is the current stack length plus 1. - saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1); + saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1, + CallerTotalBytes, CallerColdBytes); + TotalBytes += CallerTotalBytes; + ColdBytes += CallerColdBytes; + if (AddedMIBNodesForAllCallerContexts) return true; // We expect that the callers should be forced to add MIBs to disambiguate @@ -397,7 +468,7 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, std::vector ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold, - ContextSizeInfo)); + ContextSizeInfo, TotalBytes, ColdBytes)); return true; } @@ -444,12 +515,15 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) { std::vector MIBCallStack; MIBCallStack.push_back(AllocStackId); std::vector MIBNodes; + uint64_t TotalBytes = 0; + uint64_t ColdBytes = 0; assert(!Alloc->Callers.empty() && "addCallStack has not been called yet"); // The CalleeHasAmbiguousCallerContext flag is meant to say whether the // callee of the given node has more than one caller. Here the node being // passed in is the alloc and it has no callees. So it's false. if (buildMIBNodes(Alloc, Ctx, MIBCallStack, MIBNodes, - /*CalleeHasAmbiguousCallerContext=*/false)) { + /*CalleeHasAmbiguousCallerContext=*/false, TotalBytes, + ColdBytes)) { assert(MIBCallStack.size() == 1 && "Should only be left with Alloc's location in stack"); CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes)); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index e34956da673c3..375ff84f82ed2 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -176,11 +176,9 @@ static cl::opt cl::desc("Salvage stale MemProf profile"), cl::init(false), cl::Hidden); -cl::opt MinClonedColdBytePercent( - "memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, - cl::desc("Min percent of cold bytes to hint alloc cold during cloning")); - extern cl::opt MemProfReportHintedSizes; +extern cl::opt MinClonedColdBytePercent; +extern cl::opt MinCallsiteColdBytePercent; static cl::opt MinMatchedColdBytePercent( "memprof-matching-cold-threshold", cl::init(100), cl::Hidden, @@ -293,6 +291,13 @@ class ModuleMemProfiler { Function *MemProfCtorFunction = nullptr; }; +// Options under which we need to record the context size info in the alloc trie +// used to build metadata. +bool recordContextSizeInfo() { + return MemProfReportHintedSizes || MinClonedColdBytePercent < 100 || + MinCallsiteColdBytePercent < 100; +} + } // end anonymous namespace MemProfilerPass::MemProfilerPass() = default; @@ -758,7 +763,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie, AllocInfo->Info.getAllocCount(), AllocInfo->Info.getTotalLifetime()); std::vector ContextSizeInfo; - if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) { + if (recordContextSizeInfo()) { auto TotalSize = AllocInfo->Info.getTotalSize(); assert(TotalSize); assert(FullStackId != 0); @@ -1141,8 +1146,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, InlinedCallStack)) { NumOfMemProfMatchedAllocContexts++; uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes || - MinClonedColdBytePercent < 100) + if (ClPrintMemProfMatchInfo || recordContextSizeInfo()) FullStackId = computeFullStackId(AllocInfo->CallStack); auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); TotalSize += AllocInfo->Info.getTotalSize(); diff --git a/llvm/test/Transforms/PGOProfile/memprof_discard_threshold.ll b/llvm/test/Transforms/PGOProfile/memprof_discard_threshold.ll new file mode 100644 index 0000000000000..a1f8369243888 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/memprof_discard_threshold.ll @@ -0,0 +1,163 @@ +;; Tests option to discard small noncold contexts. + +;; Avoid failures on big-endian systems that can't read the profile properly +; REQUIRES: x86_64-linux + +;; Generate the profile and the IR. +; RUN: split-file %s %t + +;; Generate indexed profile +; RUN: llvm-profdata merge %t/memprof_discard_threshold.yaml -o %t.memprofdata + +;; Test default (threshold 100%). We should get the same results with and +;; without -memprof-keep-all-not-cold-contexts. + +; RUN: opt < %t/memprof_discard_threshold.ll -passes='memprof-use' -S 2>&1 | FileCheck %s --check-prefixes=CALL,MEMPROF + +;; Test discarding (threshold 80%). We should get the same results with and +;; without -memprof-keep-all-not-cold-contexts. + +; RUN: opt < %t/memprof_discard_threshold.ll -passes='memprof-use' -S -memprof-report-hinted-sizes -memprof-keep-all-not-cold-contexts 2>&1 | FileCheck %s --check-prefixes=CALL,MEMPROF,REPORT + +; RUN: opt < %t/memprof_discard_threshold.ll -passes='memprof-use' -memprof-callsite-cold-threshold=80 -S 2>&1 | FileCheck %s --check-prefixes=CALL,MEMPROF80 + +; RUN: opt < %t/memprof_discard_threshold.ll -passes='memprof-use' -S -memprof-callsite-cold-threshold=80 -memprof-report-hinted-sizes -memprof-keep-all-not-cold-contexts 2>&1 | FileCheck %s --check-prefixes=CALL,MEMPROF80,REPORT80 + +;; One context should have been discarded, with exactly 80-20 behavior. +; REPORT80: MemProf hinting: Total size for discarded non-cold full allocation context hash 7175328747938231822 for 80.00% cold bytes: 20 + +;--- memprof_discard_threshold.yaml +--- +HeapProfileRecords: + - GUID: A + AllocSites: + - Callstack: + - { Function: A, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: B, LineOffset: 6, Column: 13, IsInlineFrame: false } + - { Function: C, LineOffset: 7, Column: 11, IsInlineFrame: false } + # Cold + MemInfoBlock: + AllocCount: 1 + TotalSize: 100 + TotalLifetime: 200000 + TotalLifetimeAccessDensity: 0 + - Callstack: + - { Function: A, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: B, LineOffset: 6, Column: 13, IsInlineFrame: false } + - { Function: D, LineOffset: 8, Column: 20, IsInlineFrame: false } + # Not cold + # While this one is pruned without -memprof-keep-all-not-cold-contexts, + # if we don't track the aggregate total/cold bytes correctly for + # discarded contexts we might think that at callsite B we are more than + # 80% cold and discard all the non-cold contexts. + MemInfoBlock: + AllocCount: 1 + TotalSize: 500 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + - Callstack: + - { Function: A, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: B, LineOffset: 6, Column: 13, IsInlineFrame: false } + - { Function: E, LineOffset: 5, Column: 4, IsInlineFrame: false } + - { Function: F, LineOffset: 4, Column: 5, IsInlineFrame: false } + # Not cold + MemInfoBlock: + AllocCount: 1 + TotalSize: 30 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + - Callstack: + - { Function: A, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: B, LineOffset: 6, Column: 13, IsInlineFrame: false } + - { Function: E, LineOffset: 5, Column: 4, IsInlineFrame: false } + - { Function: G, LineOffset: 3, Column: 7, IsInlineFrame: false } + - { Function: H, LineOffset: 2, Column: 15, IsInlineFrame: false } + # Cold + MemInfoBlock: + AllocCount: 1 + TotalSize: 80 + TotalLifetime: 200000 + TotalLifetimeAccessDensity: 0 + - Callstack: + - { Function: A, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: B, LineOffset: 6, Column: 13, IsInlineFrame: false } + - { Function: E, LineOffset: 5, Column: 4, IsInlineFrame: false } + - { Function: G, LineOffset: 3, Column: 7, IsInlineFrame: false } + - { Function: I, LineOffset: 7, Column: 16, IsInlineFrame: false } + # Not cold + MemInfoBlock: + AllocCount: 1 + TotalSize: 20 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + CallSites: [] + +;--- memprof_discard_threshold.ll +; ModuleID = 'memprof_discard_threshold.cc' +source_filename = "memprof_discard_threshold.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local noundef ptr @A() !dbg !10 { +entry: + ; CALL: call {{.*}} @_Znam{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]] + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 32), !dbg !13 + ret ptr %call +} + +declare noundef nonnull ptr @_Znam(i64 noundef) + +; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], +; REPORT-SAME: ![[MIB2:[0-9]+]], +; MEMPROF-SAME: ![[MIB3:[0-9]+]], ![[MIB4:[0-9]+]] +; REPORT-SAME: , ![[MIB5:[0-9]+]]} +; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"cold" +; REPORT-SAME: , ![[SIZE1:[0-9]+]]} +; MEMPROF: ![[STACK1]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 705268496523263927} +; REPORT: ![[SIZE1]] = !{i64 -7154823362113119138, i64 100} +; REPORT: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"notcold", ![[SIZE2:[0-9]+]]} +; REPORT: ![[STACK2]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -6015552466537626283, i64 7621414677325196405} +; REPORT: ![[SIZE2]] = !{i64 4574148894276641937, i64 30} +; MEMPROF: ![[MIB3]] = !{![[STACK3:[0-9]+]], !"cold" +; REPORT-SAME: , ![[SIZE3:[0-9]+]]} +; MEMPROF: ![[STACK3]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -6015552466537626283, i64 -2897722569788633560, i64 3206456850862191843} +; REPORT: ![[SIZE3]] = !{i64 2848517899452258040, i64 80} +; MEMPROF: ![[MIB4]] = !{![[STACK4:[0-9]+]], !"notcold" +; REPORT-SAME: , ![[SIZE4:[0-9]+]]} +; MEMPROF: ![[STACK4]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -6015552466537626283, i64 -2897722569788633560, i64 -1037739922429764316} +; REPORT: ![[SIZE4]] = !{i64 7175328747938231822, i64 20} +; REPORT: ![[MIB5]] = !{![[STACK5:[0-9]+]], !"notcold", ![[SIZE5:[0-9]+]]} +; REPORT: ![[STACK5]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -3111772584478263690} +; REPORT: ![[SIZE5]] = !{i64 8469515017747579284, i64 500} +; MEMPROF: ![[C1]] = !{i64 -2647357475745718070} + +; MEMPROF80: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]], ![[MIB3:[0-9]+]] +; REPORT80-SAME: , ![[MIB5:[0-9]+]]} +; MEMPROF80: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"cold" +; REPORT80-SAME: , ![[SIZE1:[0-9]+]]} +; MEMPROF80: ![[STACK1]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 705268496523263927} +; REPORT80: ![[SIZE1]] = !{i64 -7154823362113119138, i64 100} +; MEMPROF80: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"notcold" +; REPORT80-SAME: , ![[SIZE2:[0-9]+]]} +; MEMPROF80: ![[STACK2]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -6015552466537626283, i64 7621414677325196405} +; REPORT80: ![[SIZE2]] = !{i64 4574148894276641937, i64 30} +; MEMPROF80: ![[MIB3]] = !{![[STACK3:[0-9]+]], !"cold" +; REPORT80-SAME: , ![[SIZE3:[0-9]+]]} +; MEMPROF80: ![[STACK3]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -6015552466537626283, i64 -2897722569788633560, i64 3206456850862191843} +; REPORT80: ![[SIZE3]] = !{i64 2848517899452258040, i64 80} +; REPORT80: ![[MIB5]] = !{![[STACK5:[0-9]+]], !"notcold", ![[SIZE5:[0-9]+]]} +; REPORT80: ![[STACK5]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -3111772584478263690} +; REPORT80: ![[SIZE5]] = !{i64 8469515017747579284, i64 500} +; MEMPROF80: ![[C1]] = !{i64 -2647357475745718070} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "memprof.cc", directory: ".", checksumkind: CSK_MD5, checksum: "e8c40ebe4b21776b4d60e9632cbc13c2") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(name: "A", linkageName: "A", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12) +!11 = !DISubroutineType(types: !12) +!12 = !{} +!13 = !DILocation(line: 5, column: 10, scope: !10)