From 772f768cc2ee09091ec677a3dcd5f3c865e7488f Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 8 May 2025 10:07:43 -0700 Subject: [PATCH 1/2] [MemProf] Optionally discard small non-cold contexts Adds a new option -memprof-callsite-cold-threshold that allows specifying a percent that will cause non-cold contexts to be discarded if the percent cold bytes at a callsite including that context exceeds the given threshold. Default is 100% (no discarding). This reduces the amount of cloning needed to expose cold allocation contexts when parts of the context are dominantly cold. This motivated the change in PR138792, since discarding a context might require a different decision about which not-cold contexts must be kept to expose cloning requirements, so we need to determine that on the fly. Additionally, this required a change to include the context size information in the alloc trie in more cases, so we now guard the inclusion of this information in the generated metadata on the option values. --- .../include/llvm/Analysis/MemoryProfileInfo.h | 3 +- llvm/lib/Analysis/MemoryProfileInfo.cpp | 100 +++++++++-- .../Instrumentation/MemProfiler.cpp | 18 +- .../PGOProfile/memprof_discard_threshold.ll | 163 ++++++++++++++++++ 4 files changed, 258 insertions(+), 26 deletions(-) create mode 100644 llvm/test/Transforms/PGOProfile/memprof_discard_threshold.ll diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h index 86c58d1261b71..1d98f86f50484 100644 --- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h +++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h @@ -103,7 +103,8 @@ class CallStackTrie { bool buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, std::vector &MIBCallStack, std::vector &MIBNodes, - bool CalleeHasAmbiguousCallerContext); + bool CalleeHasAmbiguousCallerContext, uint64_t &TotalBytes, + uint64_t &ColdBytes); public: CallStackTrie() = default; diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 924325b97eaaa..678966313865f 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -13,6 +13,7 @@ #include "llvm/Analysis/MemoryProfileInfo.h" #include "llvm/IR/Constants.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Format.h" using namespace llvm; using namespace llvm::memprof; @@ -58,6 +59,19 @@ cl::opt MemProfKeepAllNotColdContexts( "memprof-keep-all-not-cold-contexts", cl::init(false), cl::Hidden, cl::desc("Keep all non-cold contexts (increases cloning overheads)")); +cl::opt MinClonedColdBytePercent( + "memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, + cl::desc("Min percent of cold bytes to hint alloc cold during cloning")); + +// Discard non-cold contexts if they overlap with much larger cold contexts, +// specifically, if all contexts reaching a given callsite are at least this +// percent cold byte allocations. This reduces the amount of cloning required +// to expose the cold contexts when they greatly dominate non-cold contexts. +cl::opt MinCallsiteColdBytePercent( + "memprof-callsite-cold-threshold", cl::init(100), cl::Hidden, + cl::desc("Min percent of cold bytes at a callsite to discard non-cold " + "contexts")); + AllocationType llvm::memprof::getAllocType(uint64_t TotalLifetimeAccessDensity, uint64_t AllocCount, uint64_t TotalLifetime) { @@ -208,21 +222,32 @@ void CallStackTrie::addCallStack(MDNode *MIB) { static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef MIBCallStack, AllocationType AllocType, - ArrayRef ContextSizeInfo) { + ArrayRef ContextSizeInfo, + uint64_t &TotalBytes, uint64_t &ColdBytes) { SmallVector MIBPayload( {buildCallstackMetadata(MIBCallStack, Ctx)}); MIBPayload.push_back( MDString::get(Ctx, getAllocTypeAttributeString(AllocType))); if (!ContextSizeInfo.empty()) { for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) { - auto *FullStackIdMD = ValueAsMetadata::get( - ConstantInt::get(Type::getInt64Ty(Ctx), FullStackId)); - auto *TotalSizeMD = ValueAsMetadata::get( - ConstantInt::get(Type::getInt64Ty(Ctx), TotalSize)); - auto *ContextSizeMD = MDNode::get(Ctx, {FullStackIdMD, TotalSizeMD}); - MIBPayload.push_back(ContextSizeMD); + TotalBytes += TotalSize; + if (AllocType == AllocationType::Cold) + ColdBytes += TotalSize; + // Only add the context size info as metadata if we need it in the thin + // link (currently if reporting of hinted sizes is enabled or we have + // specified a threshold for marking allocations cold after cloning). + if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) { + auto *FullStackIdMD = ValueAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Ctx), FullStackId)); + auto *TotalSizeMD = ValueAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Ctx), TotalSize)); + auto *ContextSizeMD = MDNode::get(Ctx, {FullStackIdMD, TotalSizeMD}); + MIBPayload.push_back(ContextSizeMD); + } } } + assert(MinCallsiteColdBytePercent >= 100 || + (!ContextSizeInfo.empty() && TotalBytes > 0)); return MDNode::get(Ctx, MIBPayload); } @@ -246,9 +271,13 @@ void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) { // on options that enable filtering out some NotCold contexts. static void saveFilteredNewMIBNodes(std::vector &NewMIBNodes, std::vector &SavedMIBNodes, - unsigned CallerContextLength) { + unsigned CallerContextLength, + uint64_t TotalBytes, uint64_t ColdBytes) { + bool MostlyCold = MinCallsiteColdBytePercent < 100 && + ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes; + // In the simplest case, with pruning disabled, keep all the new MIB nodes. - if (MemProfKeepAllNotColdContexts) { + if (MemProfKeepAllNotColdContexts && !MostlyCold) { append_range(SavedMIBNodes, NewMIBNodes); return; } @@ -271,6 +300,27 @@ static void saveFilteredNewMIBNodes(std::vector &NewMIBNodes, } }; + if (MostlyCold) { + auto NewColdMIBNodes = + make_filter_range(NewMIBNodes, [&](const Metadata *M) { + auto MIBMD = cast(M); + // Only append cold contexts. + if (getMIBAllocType(MIBMD) == AllocationType::Cold) + return true; + if (MemProfReportHintedSizes) { + float PercentCold = ColdBytes * 100.0 / TotalBytes; + std::string PercentStr; + llvm::raw_string_ostream OS(PercentStr); + OS << format(" for %5.2f%% cold bytes", PercentCold); + EmitMessageForRemovedContexts(MIBMD, "discarded", OS.str()); + } + return false; + }); + for (auto *M : NewColdMIBNodes) + SavedMIBNodes.push_back(M); + return; + } + // Prune unneeded NotCold contexts, taking advantage of the fact // that we later will only clone Cold contexts, as NotCold is the allocation // default. We only need to keep as metadata the NotCold contexts that @@ -341,17 +391,20 @@ static void saveFilteredNewMIBNodes(std::vector &NewMIBNodes, // Recursive helper to trim contexts and create metadata nodes. // Caller should have pushed Node's loc to MIBCallStack. Doing this in the // caller makes it simpler to handle the many early returns in this method. +// Updates the total and cold profiled bytes in the subtrie rooted at this node. bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, std::vector &MIBCallStack, std::vector &MIBNodes, - bool CalleeHasAmbiguousCallerContext) { + bool CalleeHasAmbiguousCallerContext, + uint64_t &TotalBytes, uint64_t &ColdBytes) { // Trim context below the first node in a prefix with a single alloc type. // Add an MIB record for the current call stack prefix. if (hasSingleAllocType(Node->AllocTypes)) { std::vector ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); - MIBNodes.push_back(createMIBNode( - Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo)); + MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, + (AllocationType)Node->AllocTypes, + ContextSizeInfo, TotalBytes, ColdBytes)); return true; } @@ -364,17 +417,25 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, // that will later be filtered before adding to the caller's MIBNodes // vector. std::vector NewMIBNodes; + // Determine the total and cold byte counts for all callers, then add to the + // caller's counts further below. + uint64_t CallerTotalBytes = 0; + uint64_t CallerColdBytes = 0; for (auto &Caller : Node->Callers) { MIBCallStack.push_back(Caller.first); - AddedMIBNodesForAllCallerContexts &= - buildMIBNodes(Caller.second, Ctx, MIBCallStack, NewMIBNodes, - NodeHasAmbiguousCallerContext); + AddedMIBNodesForAllCallerContexts &= buildMIBNodes( + Caller.second, Ctx, MIBCallStack, NewMIBNodes, + NodeHasAmbiguousCallerContext, CallerTotalBytes, CallerColdBytes); // Remove Caller. MIBCallStack.pop_back(); } // Pass in the stack length of the MIB nodes added for the immediate caller, // which is the current stack length plus 1. - saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1); + saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1, + CallerTotalBytes, CallerColdBytes); + TotalBytes += CallerTotalBytes; + ColdBytes += CallerColdBytes; + if (AddedMIBNodesForAllCallerContexts) return true; // We expect that the callers should be forced to add MIBs to disambiguate @@ -397,7 +458,7 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, std::vector ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold, - ContextSizeInfo)); + ContextSizeInfo, TotalBytes, ColdBytes)); return true; } @@ -444,12 +505,15 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) { std::vector MIBCallStack; MIBCallStack.push_back(AllocStackId); std::vector MIBNodes; + uint64_t TotalBytes = 0; + uint64_t ColdBytes = 0; assert(!Alloc->Callers.empty() && "addCallStack has not been called yet"); // The CalleeHasAmbiguousCallerContext flag is meant to say whether the // callee of the given node has more than one caller. Here the node being // passed in is the alloc and it has no callees. So it's false. if (buildMIBNodes(Alloc, Ctx, MIBCallStack, MIBNodes, - /*CalleeHasAmbiguousCallerContext=*/false)) { + /*CalleeHasAmbiguousCallerContext=*/false, TotalBytes, + ColdBytes)) { assert(MIBCallStack.size() == 1 && "Should only be left with Alloc's location in stack"); CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes)); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index e34956da673c3..375ff84f82ed2 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -176,11 +176,9 @@ static cl::opt cl::desc("Salvage stale MemProf profile"), cl::init(false), cl::Hidden); -cl::opt MinClonedColdBytePercent( - "memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, - cl::desc("Min percent of cold bytes to hint alloc cold during cloning")); - extern cl::opt MemProfReportHintedSizes; +extern cl::opt MinClonedColdBytePercent; +extern cl::opt MinCallsiteColdBytePercent; static cl::opt MinMatchedColdBytePercent( "memprof-matching-cold-threshold", cl::init(100), cl::Hidden, @@ -293,6 +291,13 @@ class ModuleMemProfiler { Function *MemProfCtorFunction = nullptr; }; +// Options under which we need to record the context size info in the alloc trie +// used to build metadata. +bool recordContextSizeInfo() { + return MemProfReportHintedSizes || MinClonedColdBytePercent < 100 || + MinCallsiteColdBytePercent < 100; +} + } // end anonymous namespace MemProfilerPass::MemProfilerPass() = default; @@ -758,7 +763,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie, AllocInfo->Info.getAllocCount(), AllocInfo->Info.getTotalLifetime()); std::vector ContextSizeInfo; - if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) { + if (recordContextSizeInfo()) { auto TotalSize = AllocInfo->Info.getTotalSize(); assert(TotalSize); assert(FullStackId != 0); @@ -1141,8 +1146,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, InlinedCallStack)) { NumOfMemProfMatchedAllocContexts++; uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes || - MinClonedColdBytePercent < 100) + if (ClPrintMemProfMatchInfo || recordContextSizeInfo()) FullStackId = computeFullStackId(AllocInfo->CallStack); auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); TotalSize += AllocInfo->Info.getTotalSize(); diff --git a/llvm/test/Transforms/PGOProfile/memprof_discard_threshold.ll b/llvm/test/Transforms/PGOProfile/memprof_discard_threshold.ll new file mode 100644 index 0000000000000..a1f8369243888 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/memprof_discard_threshold.ll @@ -0,0 +1,163 @@ +;; Tests option to discard small noncold contexts. + +;; Avoid failures on big-endian systems that can't read the profile properly +; REQUIRES: x86_64-linux + +;; Generate the profile and the IR. +; RUN: split-file %s %t + +;; Generate indexed profile +; RUN: llvm-profdata merge %t/memprof_discard_threshold.yaml -o %t.memprofdata + +;; Test default (threshold 100%). We should get the same results with and +;; without -memprof-keep-all-not-cold-contexts. + +; RUN: opt < %t/memprof_discard_threshold.ll -passes='memprof-use' -S 2>&1 | FileCheck %s --check-prefixes=CALL,MEMPROF + +;; Test discarding (threshold 80%). We should get the same results with and +;; without -memprof-keep-all-not-cold-contexts. + +; RUN: opt < %t/memprof_discard_threshold.ll -passes='memprof-use' -S -memprof-report-hinted-sizes -memprof-keep-all-not-cold-contexts 2>&1 | FileCheck %s --check-prefixes=CALL,MEMPROF,REPORT + +; RUN: opt < %t/memprof_discard_threshold.ll -passes='memprof-use' -memprof-callsite-cold-threshold=80 -S 2>&1 | FileCheck %s --check-prefixes=CALL,MEMPROF80 + +; RUN: opt < %t/memprof_discard_threshold.ll -passes='memprof-use' -S -memprof-callsite-cold-threshold=80 -memprof-report-hinted-sizes -memprof-keep-all-not-cold-contexts 2>&1 | FileCheck %s --check-prefixes=CALL,MEMPROF80,REPORT80 + +;; One context should have been discarded, with exactly 80-20 behavior. +; REPORT80: MemProf hinting: Total size for discarded non-cold full allocation context hash 7175328747938231822 for 80.00% cold bytes: 20 + +;--- memprof_discard_threshold.yaml +--- +HeapProfileRecords: + - GUID: A + AllocSites: + - Callstack: + - { Function: A, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: B, LineOffset: 6, Column: 13, IsInlineFrame: false } + - { Function: C, LineOffset: 7, Column: 11, IsInlineFrame: false } + # Cold + MemInfoBlock: + AllocCount: 1 + TotalSize: 100 + TotalLifetime: 200000 + TotalLifetimeAccessDensity: 0 + - Callstack: + - { Function: A, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: B, LineOffset: 6, Column: 13, IsInlineFrame: false } + - { Function: D, LineOffset: 8, Column: 20, IsInlineFrame: false } + # Not cold + # While this one is pruned without -memprof-keep-all-not-cold-contexts, + # if we don't track the aggregate total/cold bytes correctly for + # discarded contexts we might think that at callsite B we are more than + # 80% cold and discard all the non-cold contexts. + MemInfoBlock: + AllocCount: 1 + TotalSize: 500 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + - Callstack: + - { Function: A, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: B, LineOffset: 6, Column: 13, IsInlineFrame: false } + - { Function: E, LineOffset: 5, Column: 4, IsInlineFrame: false } + - { Function: F, LineOffset: 4, Column: 5, IsInlineFrame: false } + # Not cold + MemInfoBlock: + AllocCount: 1 + TotalSize: 30 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + - Callstack: + - { Function: A, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: B, LineOffset: 6, Column: 13, IsInlineFrame: false } + - { Function: E, LineOffset: 5, Column: 4, IsInlineFrame: false } + - { Function: G, LineOffset: 3, Column: 7, IsInlineFrame: false } + - { Function: H, LineOffset: 2, Column: 15, IsInlineFrame: false } + # Cold + MemInfoBlock: + AllocCount: 1 + TotalSize: 80 + TotalLifetime: 200000 + TotalLifetimeAccessDensity: 0 + - Callstack: + - { Function: A, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: B, LineOffset: 6, Column: 13, IsInlineFrame: false } + - { Function: E, LineOffset: 5, Column: 4, IsInlineFrame: false } + - { Function: G, LineOffset: 3, Column: 7, IsInlineFrame: false } + - { Function: I, LineOffset: 7, Column: 16, IsInlineFrame: false } + # Not cold + MemInfoBlock: + AllocCount: 1 + TotalSize: 20 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + CallSites: [] + +;--- memprof_discard_threshold.ll +; ModuleID = 'memprof_discard_threshold.cc' +source_filename = "memprof_discard_threshold.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local noundef ptr @A() !dbg !10 { +entry: + ; CALL: call {{.*}} @_Znam{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]] + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 32), !dbg !13 + ret ptr %call +} + +declare noundef nonnull ptr @_Znam(i64 noundef) + +; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], +; REPORT-SAME: ![[MIB2:[0-9]+]], +; MEMPROF-SAME: ![[MIB3:[0-9]+]], ![[MIB4:[0-9]+]] +; REPORT-SAME: , ![[MIB5:[0-9]+]]} +; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"cold" +; REPORT-SAME: , ![[SIZE1:[0-9]+]]} +; MEMPROF: ![[STACK1]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 705268496523263927} +; REPORT: ![[SIZE1]] = !{i64 -7154823362113119138, i64 100} +; REPORT: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"notcold", ![[SIZE2:[0-9]+]]} +; REPORT: ![[STACK2]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -6015552466537626283, i64 7621414677325196405} +; REPORT: ![[SIZE2]] = !{i64 4574148894276641937, i64 30} +; MEMPROF: ![[MIB3]] = !{![[STACK3:[0-9]+]], !"cold" +; REPORT-SAME: , ![[SIZE3:[0-9]+]]} +; MEMPROF: ![[STACK3]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -6015552466537626283, i64 -2897722569788633560, i64 3206456850862191843} +; REPORT: ![[SIZE3]] = !{i64 2848517899452258040, i64 80} +; MEMPROF: ![[MIB4]] = !{![[STACK4:[0-9]+]], !"notcold" +; REPORT-SAME: , ![[SIZE4:[0-9]+]]} +; MEMPROF: ![[STACK4]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -6015552466537626283, i64 -2897722569788633560, i64 -1037739922429764316} +; REPORT: ![[SIZE4]] = !{i64 7175328747938231822, i64 20} +; REPORT: ![[MIB5]] = !{![[STACK5:[0-9]+]], !"notcold", ![[SIZE5:[0-9]+]]} +; REPORT: ![[STACK5]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -3111772584478263690} +; REPORT: ![[SIZE5]] = !{i64 8469515017747579284, i64 500} +; MEMPROF: ![[C1]] = !{i64 -2647357475745718070} + +; MEMPROF80: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]], ![[MIB3:[0-9]+]] +; REPORT80-SAME: , ![[MIB5:[0-9]+]]} +; MEMPROF80: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"cold" +; REPORT80-SAME: , ![[SIZE1:[0-9]+]]} +; MEMPROF80: ![[STACK1]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 705268496523263927} +; REPORT80: ![[SIZE1]] = !{i64 -7154823362113119138, i64 100} +; MEMPROF80: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"notcold" +; REPORT80-SAME: , ![[SIZE2:[0-9]+]]} +; MEMPROF80: ![[STACK2]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -6015552466537626283, i64 7621414677325196405} +; REPORT80: ![[SIZE2]] = !{i64 4574148894276641937, i64 30} +; MEMPROF80: ![[MIB3]] = !{![[STACK3:[0-9]+]], !"cold" +; REPORT80-SAME: , ![[SIZE3:[0-9]+]]} +; MEMPROF80: ![[STACK3]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -6015552466537626283, i64 -2897722569788633560, i64 3206456850862191843} +; REPORT80: ![[SIZE3]] = !{i64 2848517899452258040, i64 80} +; REPORT80: ![[MIB5]] = !{![[STACK5:[0-9]+]], !"notcold", ![[SIZE5:[0-9]+]]} +; REPORT80: ![[STACK5]] = !{i64 -2647357475745718070, i64 869302454322824036, i64 -3111772584478263690} +; REPORT80: ![[SIZE5]] = !{i64 8469515017747579284, i64 500} +; MEMPROF80: ![[C1]] = !{i64 -2647357475745718070} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "memprof.cc", directory: ".", checksumkind: CSK_MD5, checksum: "e8c40ebe4b21776b4d60e9632cbc13c2") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(name: "A", linkageName: "A", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12) +!11 = !DISubroutineType(types: !12) +!12 = !{} +!13 = !DILocation(line: 5, column: 10, scope: !10) From 63902b26e7d6f85e720a1ec4aac3e5c6fa0250a8 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Fri, 9 May 2025 13:32:06 -0700 Subject: [PATCH 2/2] Address comments --- llvm/lib/Analysis/MemoryProfileInfo.cpp | 52 +++++++++++++++---------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 678966313865f..f9145854784f2 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -228,26 +228,32 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef MIBCallStack, {buildCallstackMetadata(MIBCallStack, Ctx)}); MIBPayload.push_back( MDString::get(Ctx, getAllocTypeAttributeString(AllocType))); - if (!ContextSizeInfo.empty()) { - for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) { - TotalBytes += TotalSize; - if (AllocType == AllocationType::Cold) - ColdBytes += TotalSize; - // Only add the context size info as metadata if we need it in the thin - // link (currently if reporting of hinted sizes is enabled or we have - // specified a threshold for marking allocations cold after cloning). - if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) { - auto *FullStackIdMD = ValueAsMetadata::get( - ConstantInt::get(Type::getInt64Ty(Ctx), FullStackId)); - auto *TotalSizeMD = ValueAsMetadata::get( - ConstantInt::get(Type::getInt64Ty(Ctx), TotalSize)); - auto *ContextSizeMD = MDNode::get(Ctx, {FullStackIdMD, TotalSizeMD}); - MIBPayload.push_back(ContextSizeMD); - } + + if (ContextSizeInfo.empty()) { + // The profile matcher should have provided context size info if there was a + // MinCallsiteColdBytePercent < 100. Here we check >=100 to gracefully + // handle a user-provided percent larger than 100. + assert(MinCallsiteColdBytePercent >= 100); + return MDNode::get(Ctx, MIBPayload); + } + + for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) { + TotalBytes += TotalSize; + if (AllocType == AllocationType::Cold) + ColdBytes += TotalSize; + // Only add the context size info as metadata if we need it in the thin + // link (currently if reporting of hinted sizes is enabled or we have + // specified a threshold for marking allocations cold after cloning). + if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) { + auto *FullStackIdMD = ValueAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Ctx), FullStackId)); + auto *TotalSizeMD = ValueAsMetadata::get( + ConstantInt::get(Type::getInt64Ty(Ctx), TotalSize)); + auto *ContextSizeMD = MDNode::get(Ctx, {FullStackIdMD, TotalSizeMD}); + MIBPayload.push_back(ContextSizeMD); } } - assert(MinCallsiteColdBytePercent >= 100 || - (!ContextSizeInfo.empty() && TotalBytes > 0)); + assert(TotalBytes > 0); return MDNode::get(Ctx, MIBPayload); } @@ -273,8 +279,9 @@ static void saveFilteredNewMIBNodes(std::vector &NewMIBNodes, std::vector &SavedMIBNodes, unsigned CallerContextLength, uint64_t TotalBytes, uint64_t ColdBytes) { - bool MostlyCold = MinCallsiteColdBytePercent < 100 && - ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes; + const bool MostlyCold = + MinCallsiteColdBytePercent < 100 && + ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes; // In the simplest case, with pruning disabled, keep all the new MIB nodes. if (MemProfKeepAllNotColdContexts && !MostlyCold) { @@ -300,6 +307,9 @@ static void saveFilteredNewMIBNodes(std::vector &NewMIBNodes, } }; + // If the cold bytes at the current callsite exceed the given threshold, we + // discard all non-cold contexts so do not need any of the later pruning + // handling. We can simply copy over all the cold contexts and return early. if (MostlyCold) { auto NewColdMIBNodes = make_filter_range(NewMIBNodes, [&](const Metadata *M) { @@ -308,7 +318,7 @@ static void saveFilteredNewMIBNodes(std::vector &NewMIBNodes, if (getMIBAllocType(MIBMD) == AllocationType::Cold) return true; if (MemProfReportHintedSizes) { - float PercentCold = ColdBytes * 100.0 / TotalBytes; + const float PercentCold = ColdBytes * 100.0 / TotalBytes; std::string PercentStr; llvm::raw_string_ostream OS(PercentStr); OS << format(" for %5.2f%% cold bytes", PercentCold);