From a4d1c1244bd7445ec468cdcab350eba4b0266fe3 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Mon, 2 Jun 2025 17:20:19 -0700 Subject: [PATCH 1/3] [MemProf] Optionally save context size info on largest cold allocations In order to allow selective reporting of context hinting during the LTO link, and in the future to allow selective more aggressive cloning, add an option to specify a minimum percent of the max cold size in the profile summary. Contexts that meet that threshold will get context size info metadata (and ThinLTO summary information) on the associated allocations. Specifying -memprof-report-hinted-sizes during the pre-LTO compile step will continue to cause all contexts to receive this metadata. But specifying -memprof-report-hinted-sizes only during the LTO link will cause only those that meet the new threshold and have the metadata to get reported. To support this, because the alloc info summary and associated bitcode requires the context size information to be in the same order as the other context information, 0s are inserted for contexts without this metadata. The bitcode writer uses a more compact format for the context ids to allow better compression of the 0s. As part of this change several helper methods are added to query whether metadata contains context size info on any or all contexts. --- .../include/llvm/Analysis/MemoryProfileInfo.h | 19 ++- llvm/lib/Analysis/MemoryProfileInfo.cpp | 48 +++++-- llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 17 ++- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 8 ++ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 26 ++-- .../IPO/MemProfContextDisambiguation.cpp | 5 +- .../Instrumentation/MemProfiler.cpp | 25 ++-- .../X86/memprof-report-hinted-partial.ll | 73 ++++++++++ .../memprof_max_cold_threshold.test | 131 ++++++++++++++++++ 9 files changed, 314 insertions(+), 38 deletions(-) create mode 100644 llvm/test/ThinLTO/X86/memprof-report-hinted-partial.ll create mode 100644 llvm/test/Transforms/PGOProfile/memprof_max_cold_threshold.test diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h index 8cbb8673b69f5..b042a717e4e49 100644 --- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h +++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h @@ -24,6 +24,18 @@ class OptimizationRemarkEmitter; namespace memprof { +/// Whether the alloc memeprof metadata will include context size info for all +/// MIBs. +LLVM_ABI bool metadataIncludesAllContextSizeInfo(); + +/// Whether the alloc memprof metadata may include context size info for some +/// MIBs (but possibly not all). +LLVM_ABI bool metadataMayIncludeContextSizeInfo(); + +/// Whether we need to record the context size info in the alloc trie used to +/// build metadata. +LLVM_ABI bool recordContextSizeInfoForAnalysis(); + /// Build callstack metadata from the provided list of call stack ids. Returns /// the resulting metadata node. LLVM_ABI MDNode *buildCallstackMetadata(ArrayRef CallStack, @@ -87,6 +99,9 @@ class CallStackTrie { // allocations for which we apply non-context sensitive allocation hints. OptimizationRemarkEmitter *ORE; + // The maximum size of a cold allocation context, from the profile summary. + uint64_t MaxColdSize; + void deleteTrieNode(CallStackTrieNode *Node) { if (!Node) return; @@ -113,7 +128,9 @@ class CallStackTrie { uint64_t &ColdBytes); public: - CallStackTrie(OptimizationRemarkEmitter *ORE = nullptr) : ORE(ORE) {} + CallStackTrie(OptimizationRemarkEmitter *ORE = nullptr, + uint64_t MaxColdSize = 0) + : ORE(ORE), MaxColdSize(MaxColdSize) {} ~CallStackTrie() { deleteTrieNode(Alloc); } bool empty() const { return Alloc == nullptr; } diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 347377522101a..2ea765be5dbe6 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -46,6 +46,25 @@ cl::opt MinCallsiteColdBytePercent( cl::desc("Min percent of cold bytes at a callsite to discard non-cold " "contexts")); +// Enable saving context size information for largest cold contexts, which can +// be used to flag contexts for more aggressive cloning and reporting. +cl::opt MinPercentMaxColdSize( + "memprof-min-percent-max-cold-size", cl::init(100), cl::Hidden, + cl::desc("Min percent of max cold bytes for critical cold context")); + +bool llvm::memprof::metadataIncludesAllContextSizeInfo() { + return MemProfReportHintedSizes || MinClonedColdBytePercent < 100; +} + +bool llvm::memprof::metadataMayIncludeContextSizeInfo() { + return metadataIncludesAllContextSizeInfo() || MinPercentMaxColdSize < 100; +} + +bool llvm::memprof::recordContextSizeInfoForAnalysis() { + return metadataMayIncludeContextSizeInfo() || + MinCallsiteColdBytePercent < 100; +} + MDNode *llvm::memprof::buildCallstackMetadata(ArrayRef CallStack, LLVMContext &Ctx) { SmallVector StackVals; @@ -168,7 +187,8 @@ void CallStackTrie::addCallStack(MDNode *MIB) { static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef MIBCallStack, AllocationType AllocType, ArrayRef ContextSizeInfo, - uint64_t &TotalBytes, uint64_t &ColdBytes) { + uint64_t &TotalBytes, uint64_t &ColdBytes, + uint64_t MaxColdSize) { SmallVector MIBPayload( {buildCallstackMetadata(MIBCallStack, Ctx)}); MIBPayload.push_back( @@ -184,12 +204,21 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef MIBCallStack, for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) { TotalBytes += TotalSize; - if (AllocType == AllocationType::Cold) + bool LargeColdContext = false; + if (AllocType == AllocationType::Cold) { ColdBytes += TotalSize; + // If we have the max cold context size from summary information and have + // requested identification of contexts above a percentage of the max, see + // if this context qualifies. + if (MaxColdSize > 0 && MinPercentMaxColdSize < 100 && + TotalSize * 100 >= MaxColdSize * MinPercentMaxColdSize) + LargeColdContext = true; + } // Only add the context size info as metadata if we need it in the thin - // link (currently if reporting of hinted sizes is enabled or we have - // specified a threshold for marking allocations cold after cloning). - if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) { + // link (currently if reporting of hinted sizes is enabled, we have + // specified a threshold for marking allocations cold after cloning, or we + // have identified this as a large cold context of interest above). + if (metadataIncludesAllContextSizeInfo() || LargeColdContext) { auto *FullStackIdMD = ValueAsMetadata::get( ConstantInt::get(Type::getInt64Ty(Ctx), FullStackId)); auto *TotalSizeMD = ValueAsMetadata::get( @@ -357,9 +386,9 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, if (hasSingleAllocType(Node->AllocTypes)) { std::vector ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); - MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, - (AllocationType)Node->AllocTypes, - ContextSizeInfo, TotalBytes, ColdBytes)); + MIBNodes.push_back( + createMIBNode(Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, + ContextSizeInfo, TotalBytes, ColdBytes, MaxColdSize)); return true; } @@ -413,7 +442,8 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, std::vector ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold, - ContextSizeInfo, TotalBytes, ColdBytes)); + ContextSizeInfo, TotalBytes, ColdBytes, + MaxColdSize)); return true; } diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index 59fa1a4b03c37..332122700c886 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -544,7 +544,8 @@ static void computeFunctionSummary( } // If we have context size information, collect it for inclusion in // the summary. - assert(MIBMD->getNumOperands() > 2 || !MemProfReportHintedSizes); + assert(MIBMD->getNumOperands() > 2 || + !metadataIncludesAllContextSizeInfo()); if (MIBMD->getNumOperands() > 2) { std::vector ContextSizes; for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) { @@ -558,7 +559,21 @@ static void computeFunctionSummary( ->getZExtValue(); ContextSizes.push_back({FullStackId, TS}); } + // The ContextSizeInfos must be in the same relative position as the + // associated MIB. In some cases we only include a ContextSizeInfo + // for a subset of MIBs in an allocation. In those cases we insert + // 0s for the other MIBs. Handle the case where the first + // ContextSizeInfo being inserted is not for the first MIB, insert + // a pair of 0s for each of the prior MIBs. + if (ContextSizeInfos.empty() && !MIBs.empty()) + ContextSizeInfos.insert(ContextSizeInfos.begin(), MIBs.size(), + {{0, 0}}); ContextSizeInfos.push_back(std::move(ContextSizes)); + } else if (!ContextSizeInfos.empty()) { + // See earlier comment about handling case of ContextSizeInfos only + // for a subset of MIBs. Insert a pair of 0s for this MIB as it does + // not have a ContextSizeInfo but other MIBs did. + ContextSizeInfos.push_back({{0, 0}}); } MIBs.push_back( MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices))); diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index ce7b1ef65eed7..04840b50fa272 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -8164,6 +8164,14 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { ContextSizes.reserve(NumContextSizeInfoEntries); for (unsigned J = 0; J < NumContextSizeInfoEntries; J++) { assert(ContextIdIndex < PendingContextIds.size()); + // Skip any 0 entries for MIBs without the context size info. + if (PendingContextIds[ContextIdIndex] == 0) { + // The size should also be 0 if the context was 0. + assert(!Record[I]); + ContextIdIndex++; + I++; + continue; + } // PendingContextIds read from the preceding FS_ALLOC_CONTEXT_IDS // should be in the same order as the total sizes. ContextSizes.push_back( diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index f8748babb1625..ec4dd3d148d99 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/MemoryProfileInfo.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/Bitcode/BitcodeCommon.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -4584,14 +4585,23 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { Stream.EmitRecord(bitc::FS_STACK_IDS, Vals, StackIdAbbvId); } - // n x context id - auto ContextIdAbbv = std::make_shared(); - ContextIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_ALLOC_CONTEXT_IDS)); - ContextIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); - // The context ids are hashes that are close to 64 bits in size, so emitting - // as a pair of 32-bit fixed-width values is more efficient than a VBR. - ContextIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); - unsigned ContextIdAbbvId = Stream.EmitAbbrev(std::move(ContextIdAbbv)); + unsigned ContextIdAbbvId = 0; + if (metadataMayIncludeContextSizeInfo()) { + // n x context id + auto ContextIdAbbv = std::make_shared(); + ContextIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_ALLOC_CONTEXT_IDS)); + ContextIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + // The context ids are hashes that are close to 64 bits in size, so emitting + // as a pair of 32-bit fixed-width values is more efficient than a VBR if we + // are emitting them for all MIBs. Otherwise we use VBR to better compress 0 + // values that are expected to more frequently occur in an alloc's memprof + // summary. + if (metadataIncludesAllContextSizeInfo()) + ContextIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); + else + ContextIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + ContextIdAbbvId = Stream.EmitAbbrev(std::move(ContextIdAbbv)); + } // Abbrev for FS_PERMODULE_PROFILE. Abbv = std::make_shared(); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 5b4350845b726..cff38a8e68c6a 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -2232,9 +2232,8 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( CallStack::const_iterator> EmptyContext; unsigned I = 0; - assert( - (!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) || - AN.ContextSizeInfos.size() == AN.MIBs.size()); + assert(!metadataMayIncludeContextSizeInfo() || + AN.ContextSizeInfos.size() == AN.MIBs.size()); // Now add all of the MIBs and their stack nodes. for (auto &MIB : AN.MIBs) { CallStack::const_iterator> diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index c03aa5accc011..76f935b377525 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -178,10 +178,6 @@ static cl::opt cl::desc("Salvage stale MemProf profile"), cl::init(false), cl::Hidden); -extern cl::opt MemProfReportHintedSizes; -extern cl::opt MinClonedColdBytePercent; -extern cl::opt MinCallsiteColdBytePercent; - static cl::opt MinMatchedColdBytePercent( "memprof-matching-cold-threshold", cl::init(100), cl::Hidden, cl::desc("Min percent of cold bytes matched to hint allocation cold")); @@ -293,13 +289,6 @@ class ModuleMemProfiler { Function *MemProfCtorFunction = nullptr; }; -// Options under which we need to record the context size info in the alloc trie -// used to build metadata. -bool recordContextSizeInfo() { - return MemProfReportHintedSizes || MinClonedColdBytePercent < 100 || - MinCallsiteColdBytePercent < 100; -} - } // end anonymous namespace MemProfilerPass::MemProfilerPass() = default; @@ -752,7 +741,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie, AllocInfo->Info.getAllocCount(), AllocInfo->Info.getTotalLifetime()); std::vector ContextSizeInfo; - if (recordContextSizeInfo()) { + if (recordContextSizeInfoForAnalysis()) { auto TotalSize = AllocInfo->Info.getTotalSize(); assert(TotalSize); assert(FullStackId != 0); @@ -958,7 +947,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, std::map &FullStackIdToAllocMatchInfo, std::set> &MatchedCallSites, DenseMap &UndriftMaps, - OptimizationRemarkEmitter &ORE) { + OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize) { auto &Ctx = M.getContext(); // Previously we used getIRPGOFuncName() here. If F is local linkage, // getIRPGOFuncName() returns FuncName with prefix 'FileName;'. But @@ -1125,7 +1114,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, // We may match this instruction's location list to multiple MIB // contexts. Add them to a Trie specialized for trimming the contexts to // the minimal needed to disambiguate contexts with unique behavior. - CallStackTrie AllocTrie(&ORE); + CallStackTrie AllocTrie(&ORE, MaxColdSize); uint64_t TotalSize = 0; uint64_t TotalColdSize = 0; for (auto *AllocInfo : AllocInfoIter->second) { @@ -1136,7 +1125,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, InlinedCallStack)) { NumOfMemProfMatchedAllocContexts++; uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || recordContextSizeInfo()) + if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis()) FullStackId = computeFullStackId(AllocInfo->CallStack); auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); TotalSize += AllocInfo->Info.getTotalSize(); @@ -1267,6 +1256,10 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { // call stack. std::set> MatchedCallSites; + uint64_t MaxColdSize = 0; + if (auto *MemProfSum = MemProfReader->getMemProfSummary()) + MaxColdSize = MemProfSum->getMaxColdTotalSize(); + for (auto &F : M) { if (F.isDeclaration()) continue; @@ -1274,7 +1267,7 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) { const TargetLibraryInfo &TLI = FAM.getResult(F); auto &ORE = FAM.getResult(F); readMemprof(M, F, MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo, - MatchedCallSites, UndriftMaps, ORE); + MatchedCallSites, UndriftMaps, ORE, MaxColdSize); } if (ClPrintMemProfMatchInfo) { diff --git a/llvm/test/ThinLTO/X86/memprof-report-hinted-partial.ll b/llvm/test/ThinLTO/X86/memprof-report-hinted-partial.ll new file mode 100644 index 0000000000000..d4a3f9bca2cab --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-report-hinted-partial.ll @@ -0,0 +1,73 @@ +;; Test that we get hinted size reporting for just the subset of MIBs that +;; contain context size info in the metadata. + +;; Generate the bitcode including ThinLTO summary. Specify +;; -memprof-min-percent-max-cold-size (value doesn't matter) to indicate to +;; the bitcode writer that it should expect and optimize for partial context +;; size info. +; RUN: opt -thinlto-bc -memprof-min-percent-max-cold-size=50 %s >%t.o + +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-report-hinted-sizes \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=SIZES + +;; We should only get these two messages from -memprof-report-hinted-sizes +;; as they are the only MIBs with recorded context size info. +; SIZES-NOT: full allocation context +; SIZES: Cold full allocation context 456 with total size 200 is Cold after cloning (context id 2) +; SIZES: Cold full allocation context 789 with total size 300 is Cold after cloning (context id 2) +; SIZES-NOT: full allocation context + +source_filename = "memprof-report-hinted-partial.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() #0 { +entry: + %call = call ptr @_Z3foov(), !callsite !0 + %call1 = call ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +define internal ptr @_Z3barv() #0 { +entry: + %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z3bazv() #0 { +entry: + %call = call ptr @_Z3barv(), !callsite !8 + ret ptr null +} + +define internal ptr @_Z3foov() #0 { +entry: + %call = call ptr @_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +attributes #0 = { noinline optnone } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5, !13} +!3 = !{!4, !"notcold"} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold", !11, !12} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} +!11 = !{i64 456, i64 200} +!12 = !{i64 789, i64 300} +!13 = !{!14, !"cold"} +!14 = !{i64 9086428284934609951, i64 12345} diff --git a/llvm/test/Transforms/PGOProfile/memprof_max_cold_threshold.test b/llvm/test/Transforms/PGOProfile/memprof_max_cold_threshold.test new file mode 100644 index 0000000000000..f96b3f0618102 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/memprof_max_cold_threshold.test @@ -0,0 +1,131 @@ +;; Test the -memprof-min-percent-max-cold-size flag for annotating only the +;; largest contexts with size info. + +; REQUIRES: x86_64-linux + +; RUN: split-file %s %t + +;; Specify version 4 so that a summary is generated. +; RUN: llvm-profdata merge --memprof-version=4 %t/memprof_max_cold_threshold.yaml -o %t/memprof_max_cold_threshold.memprofdata + +; RUN: llvm-profdata show --memory %t/memprof_max_cold_threshold.memprofdata | FileCheck %s --check-prefixes=SUMMARY +; SUMMARY: # MemProfSummary: +; SUMMARY: # Total contexts: 5 +; SUMMARY: # Total cold contexts: 4 +; SUMMARY: # Total hot contexts: 0 +; SUMMARY: # Maximum cold context total size: 400 +; SUMMARY: # Maximum warm context total size: 500 +; SUMMARY: # Maximum hot context total size: 0 + +;; Test with various thresholds, starting with the baseline of no context size +;; info. +; RUN: opt < %t/memprof_max_cold_threshold.ll -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,EXCL100,EXCL75,EXCL50 +; RUN: opt < %t/memprof_max_cold_threshold.ll -memprof-min-percent-max-cold-size=99 -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,INCL100,EXCL75,EXCL50 +; RUN: opt < %t/memprof_max_cold_threshold.ll -memprof-min-percent-max-cold-size=75 -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,INCL100,INCL75,EXCL50 +; RUN: opt < %t/memprof_max_cold_threshold.ll -memprof-min-percent-max-cold-size=50 -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,INCL100,INCL75,INCL50 + +;; Make sure serializing / deserializing through bitcode works +; RUN: opt < %t/memprof_max_cold_threshold.ll -memprof-min-percent-max-cold-size=75 -passes='memprof-use' -o %t/out.bc +; RUN: llvm-dis %t/out.bc -o - | FileCheck %s --check-prefixes=ALL,INCL100,INCL75,EXCL50 + +;--- memprof_max_cold_threshold.yaml +--- +HeapProfileRecords: + - GUID: _Z3foov + AllocSites: + - Callstack: + - { Function: _Z3foov, LineOffset: 0, Column: 22, IsInlineFrame: false } + - { Function: main, LineOffset: 2, Column: 5, IsInlineFrame: false } + MemInfoBlock: + # Smallest cold, 25% of largest cold size + TotalSize: 100 + AllocCount: 1 + TotalLifetimeAccessDensity: 1 + TotalLifetime: 1000000 + - Callstack: + - { Function: _Z3foov, LineOffset: 0, Column: 22, IsInlineFrame: false } + - { Function: main, LineOffset: 3, Column: 5, IsInlineFrame: false } + MemInfoBlock: + # Largest cold + TotalSize: 400 + AllocCount: 1 + TotalLifetimeAccessDensity: 1 + TotalLifetime: 1000000 + - Callstack: + - { Function: _Z3foov, LineOffset: 0, Column: 22, IsInlineFrame: false } + - { Function: main, LineOffset: 4, Column: 5, IsInlineFrame: false } + MemInfoBlock: + # Second largest cold, 75% of largest cold size + TotalSize: 300 + AllocCount: 1 + TotalLifetimeAccessDensity: 1 + TotalLifetime: 1000000 + - Callstack: + - { Function: _Z3foov, LineOffset: 0, Column: 22, IsInlineFrame: false } + - { Function: main, LineOffset: 5, Column: 5, IsInlineFrame: false } + MemInfoBlock: + # Second smallest cold, 50% of largest cold size + TotalSize: 200 + AllocCount: 1 + TotalLifetimeAccessDensity: 1 + TotalLifetime: 1000000 + - Callstack: + - { Function: _Z3foov, LineOffset: 0, Column: 22, IsInlineFrame: false } + - { Function: main, LineOffset: 6, Column: 5, IsInlineFrame: false } + MemInfoBlock: + # Largest context, which is non-cold (due to short lifetime) + TotalSize: 500 + AllocCount: 1 + TotalLifetimeAccessDensity: 1 + TotalLifetime: 1 + CallSites: [] +... +;--- memprof_max_cold_threshold.ll +define dso_local ptr @_Z3foov() !dbg !4 { +entry: + %call = call ptr @_Znam(i64 4) #0, !dbg !5 +; ALL: call ptr @_Znam(i64 4) {{.*}} !memprof ![[M1:[0-9]+]] + ret ptr %call +} + +; ALL: ![[M1]] = !{![[M2:[0-9]+]], ![[M3:[0-9]+]], ![[M4:[0-9]+]], ![[M5:[0-9]+]], ![[M6:[0-9]+]]} +;; The smallest never gets context size info. +; ALL: ![[M2]] = !{![[C2:[0-9]+]], !"cold"} +; ALL: ![[C2]] = !{i64 590523745590780990, i64 720385627691022109} +; ALL: ![[M3]] = !{![[C3:[0-9]+]], !"cold" +; EXCL100-NOT: ! +; EXCL100-SAME: } +; INCL100-SAME: , ![[S3:[0-9]+]]} +; ALL: ![[C3]] = !{i64 590523745590780990, i64 8256520048276991898} +; INCL100: ![[S3]] = !{i64 6509573709067523871, i64 400} +; ALL: ![[M4]] = !{![[C4:[0-9]+]], !"cold" +; EXCL50-NOT: ! +; EXCL50-SAME: } +; INCL50-SAME: , ![[S4:[0-9]+]]} +; ALL: ![[C4]] = !{i64 590523745590780990, i64 -6953100768213558995} +; INCL50: ![[S4]] = !{i64 -2629930016428010893, i64 200} +; ALL: ![[M5]] = !{![[C5:[0-9]+]], !"cold" +; EXCL75-NOT: ! +; EXCL75-SAME: } +; INCL75-SAME: , ![[S5:[0-9]+]]} +; ALL: ![[C5]] = !{i64 590523745590780990, i64 -6435117705372285425} +; INCL75: ![[S5]] = !{i64 86248815258696712, i64 300} +;; Non cold context never gets context size info +; ALL: ![[M6]] = !{![[C6:[0-9]+]], !"notcold"} +; ALL: ![[C6]] = !{i64 590523745590780990, i64 -2847840206325626610} + +declare ptr @_Znam(i64) + +attributes #0 = { builtin allocsize(0) } + +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1) +!1 = !DIFile(filename: "t", directory: "/") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 1, unit: !0) +!5 = !DILocation(line: 1, column: 22, scope: !4) +!6 = !DILocation(line: 2, column: 22, scope: !4) +!7 = !DILocation(line: 3, column: 22, scope: !4) +!8 = !DILocation(line: 4, column: 22, scope: !4) From 57f2b5adde00e30e73b07402541d1c55248a23de Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Tue, 3 Jun 2025 10:53:40 -0700 Subject: [PATCH 2/3] Address comments --- llvm/lib/Analysis/MemoryProfileInfo.cpp | 10 +- llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 32 ++++--- .../memprof_max_cold_threshold.test | 92 +++++++++++++------ 3 files changed, 85 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 2ea765be5dbe6..c5a1a6de8379c 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -187,8 +187,8 @@ void CallStackTrie::addCallStack(MDNode *MIB) { static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef MIBCallStack, AllocationType AllocType, ArrayRef ContextSizeInfo, - uint64_t &TotalBytes, uint64_t &ColdBytes, - uint64_t MaxColdSize) { + const uint64_t MaxColdSize, + uint64_t &TotalBytes, uint64_t &ColdBytes) { SmallVector MIBPayload( {buildCallstackMetadata(MIBCallStack, Ctx)}); MIBPayload.push_back( @@ -388,7 +388,7 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, collectContextSizeInfo(Node, ContextSizeInfo); MIBNodes.push_back( createMIBNode(Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, - ContextSizeInfo, TotalBytes, ColdBytes, MaxColdSize)); + ContextSizeInfo, MaxColdSize, TotalBytes, ColdBytes)); return true; } @@ -442,8 +442,8 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, std::vector ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold, - ContextSizeInfo, TotalBytes, ColdBytes, - MaxColdSize)); + ContextSizeInfo, MaxColdSize, TotalBytes, + ColdBytes)); return true; } diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index 332122700c886..a317ac471a231 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -525,6 +525,7 @@ static void computeFunctionSummary( if (MemProfMD) { std::vector MIBs; std::vector> ContextSizeInfos; + bool HasNonZeroContextSizeInfos = false; for (auto &MDOp : MemProfMD->operands()) { auto *MIBMD = cast(MDOp); MDNode *StackNode = getMIBStackNode(MIBMD); @@ -559,28 +560,31 @@ static void computeFunctionSummary( ->getZExtValue(); ContextSizes.push_back({FullStackId, TS}); } + // Flag that we need to keep the ContextSizeInfos array for this + // alloc as it now contains non-zero context info sizes. + HasNonZeroContextSizeInfos = true; + ContextSizeInfos.push_back(std::move(ContextSizes)); + } else { // The ContextSizeInfos must be in the same relative position as the // associated MIB. In some cases we only include a ContextSizeInfo - // for a subset of MIBs in an allocation. In those cases we insert - // 0s for the other MIBs. Handle the case where the first - // ContextSizeInfo being inserted is not for the first MIB, insert - // a pair of 0s for each of the prior MIBs. - if (ContextSizeInfos.empty() && !MIBs.empty()) - ContextSizeInfos.insert(ContextSizeInfos.begin(), MIBs.size(), - {{0, 0}}); - ContextSizeInfos.push_back(std::move(ContextSizes)); - } else if (!ContextSizeInfos.empty()) { - // See earlier comment about handling case of ContextSizeInfos only - // for a subset of MIBs. Insert a pair of 0s for this MIB as it does - // not have a ContextSizeInfo but other MIBs did. + // for a subset of MIBs in an allocation. To handle that, eagerly + // fill any MIB entries that don't have context size info metadata + // with a pair of 0s. Later on we will only use this array if it + // ends up containing any non-zero entries (see where we set + // HasNonZeroContextSizeInfos above). ContextSizeInfos.push_back({{0, 0}}); } MIBs.push_back( MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices))); } Allocs.push_back(AllocInfo(std::move(MIBs))); - assert(!ContextSizeInfos.empty() || !MemProfReportHintedSizes); - if (!ContextSizeInfos.empty()) { + assert(HasNonZeroContextSizeInfos || + !metadataIncludesAllContextSizeInfo()); + // We eagerly build the ContextSizeInfos array, but it will be filled + // with sub arrays of pairs of 0s if no MIBs on this alloc actually + // contained context size info metadata. Only save it if any MIBs had + // any such metadata. + if (HasNonZeroContextSizeInfos) { assert(Allocs.back().MIBs.size() == ContextSizeInfos.size()); Allocs.back().ContextSizeInfos = std::move(ContextSizeInfos); } diff --git a/llvm/test/Transforms/PGOProfile/memprof_max_cold_threshold.test b/llvm/test/Transforms/PGOProfile/memprof_max_cold_threshold.test index f96b3f0618102..8ccaa170b0ed9 100644 --- a/llvm/test/Transforms/PGOProfile/memprof_max_cold_threshold.test +++ b/llvm/test/Transforms/PGOProfile/memprof_max_cold_threshold.test @@ -1,8 +1,6 @@ ;; Test the -memprof-min-percent-max-cold-size flag for annotating only the ;; largest contexts with size info. -; REQUIRES: x86_64-linux - ; RUN: split-file %s %t ;; Specify version 4 so that a summary is generated. @@ -19,14 +17,14 @@ ;; Test with various thresholds, starting with the baseline of no context size ;; info. -; RUN: opt < %t/memprof_max_cold_threshold.ll -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,EXCL100,EXCL75,EXCL50 -; RUN: opt < %t/memprof_max_cold_threshold.ll -memprof-min-percent-max-cold-size=99 -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,INCL100,EXCL75,EXCL50 -; RUN: opt < %t/memprof_max_cold_threshold.ll -memprof-min-percent-max-cold-size=75 -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,INCL100,INCL75,EXCL50 -; RUN: opt < %t/memprof_max_cold_threshold.ll -memprof-min-percent-max-cold-size=50 -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,INCL100,INCL75,INCL50 +; RUN: opt < %t/memprof_max_cold_threshold.ll -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,NONE +; RUN: opt < %t/memprof_max_cold_threshold.ll -memprof-min-percent-max-cold-size=99 -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,THRESH99 +; RUN: opt < %t/memprof_max_cold_threshold.ll -memprof-min-percent-max-cold-size=75 -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,THRESH75 +; RUN: opt < %t/memprof_max_cold_threshold.ll -memprof-min-percent-max-cold-size=50 -passes='memprof-use' -S | FileCheck %s --check-prefixes=ALL,THRESH50 ;; Make sure serializing / deserializing through bitcode works ; RUN: opt < %t/memprof_max_cold_threshold.ll -memprof-min-percent-max-cold-size=75 -passes='memprof-use' -o %t/out.bc -; RUN: llvm-dis %t/out.bc -o - | FileCheck %s --check-prefixes=ALL,INCL100,INCL75,EXCL50 +; RUN: llvm-dis %t/out.bc -o - | FileCheck %s --check-prefixes=ALL,THRESH75 ;--- memprof_max_cold_threshold.yaml --- @@ -88,31 +86,65 @@ entry: ret ptr %call } -; ALL: ![[M1]] = !{![[M2:[0-9]+]], ![[M3:[0-9]+]], ![[M4:[0-9]+]], ![[M5:[0-9]+]], ![[M6:[0-9]+]]} +;; No MIBs get context size info without option. +; NONE: ![[M1]] = !{![[M2:[0-9]+]], ![[M3:[0-9]+]], ![[M4:[0-9]+]], ![[M5:[0-9]+]], ![[M6:[0-9]+]]} +; NONE: ![[M2]] = !{![[C2:[0-9]+]], !"cold"} +; NONE: ![[C2]] = !{i64 590523745590780990, i64 720385627691022109} +; NONE: ![[M3]] = !{![[C3:[0-9]+]], !"cold"} +; NONE: ![[C3]] = !{i64 590523745590780990, i64 8256520048276991898} +; NONE: ![[M4]] = !{![[C4:[0-9]+]], !"cold"} +; NONE: ![[C4]] = !{i64 590523745590780990, i64 -6953100768213558995} +; NONE: ![[M5]] = !{![[C5:[0-9]+]], !"cold"} +; NONE: ![[C5]] = !{i64 590523745590780990, i64 -6435117705372285425} +; NONE: ![[M6]] = !{![[C6:[0-9]+]], !"notcold"} +; NONE: ![[C6]] = !{i64 590523745590780990, i64 -2847840206325626610} + +; THRESH99: ![[M1]] = !{![[M2:[0-9]+]], ![[M3:[0-9]+]], ![[M4:[0-9]+]], ![[M5:[0-9]+]], ![[M6:[0-9]+]]} +; THRESH99: ![[M2]] = !{![[C2:[0-9]+]], !"cold"} +; THRESH99: ![[C2]] = !{i64 590523745590780990, i64 720385627691022109} +;; MIB with size 400 is now included with threshold 99% +; THRESH99: ![[M3]] = !{![[C3:[0-9]+]], !"cold", ![[S3:[0-9]+]]} +; THRESH99: ![[C3]] = !{i64 590523745590780990, i64 8256520048276991898} +; THRESH99: ![[S3]] = !{i64 6509573709067523871, i64 400} +; THRESH99: ![[M4]] = !{![[C4:[0-9]+]], !"cold"} +; THRESH99: ![[C4]] = !{i64 590523745590780990, i64 -6953100768213558995} +; THRESH99: ![[M5]] = !{![[C5:[0-9]+]], !"cold"} +; THRESH99: ![[C5]] = !{i64 590523745590780990, i64 -6435117705372285425} +; THRESH99: ![[M6]] = !{![[C6:[0-9]+]], !"notcold"} +; THRESH99: ![[C6]] = !{i64 590523745590780990, i64 -2847840206325626610} + +; THRESH75: ![[M1]] = !{![[M2:[0-9]+]], ![[M3:[0-9]+]], ![[M4:[0-9]+]], ![[M5:[0-9]+]], ![[M6:[0-9]+]]} +; THRESH75: ![[M2]] = !{![[C2:[0-9]+]], !"cold"} +; THRESH75: ![[C2]] = !{i64 590523745590780990, i64 720385627691022109} +; THRESH75: ![[M3]] = !{![[C3:[0-9]+]], !"cold", ![[S3:[0-9]+]]} +; THRESH75: ![[C3]] = !{i64 590523745590780990, i64 8256520048276991898} +; THRESH75: ![[S3]] = !{i64 6509573709067523871, i64 400} +; THRESH75: ![[M4]] = !{![[C4:[0-9]+]], !"cold"} +; THRESH75: ![[C4]] = !{i64 590523745590780990, i64 -6953100768213558995} +;; MIB with size 300 is now included with threshold 75% +; THRESH75: ![[M5]] = !{![[C5:[0-9]+]], !"cold", ![[S5:[0-9]+]]} +; THRESH75: ![[C5]] = !{i64 590523745590780990, i64 -6435117705372285425} +; THRESH75: ![[S5]] = !{i64 86248815258696712, i64 300} +; THRESH75: ![[M6]] = !{![[C6:[0-9]+]], !"notcold"} +; THRESH75: ![[C6]] = !{i64 590523745590780990, i64 -2847840206325626610} + +; THRESH50: ![[M1]] = !{![[M2:[0-9]+]], ![[M3:[0-9]+]], ![[M4:[0-9]+]], ![[M5:[0-9]+]], ![[M6:[0-9]+]]} ;; The smallest never gets context size info. -; ALL: ![[M2]] = !{![[C2:[0-9]+]], !"cold"} -; ALL: ![[C2]] = !{i64 590523745590780990, i64 720385627691022109} -; ALL: ![[M3]] = !{![[C3:[0-9]+]], !"cold" -; EXCL100-NOT: ! -; EXCL100-SAME: } -; INCL100-SAME: , ![[S3:[0-9]+]]} -; ALL: ![[C3]] = !{i64 590523745590780990, i64 8256520048276991898} -; INCL100: ![[S3]] = !{i64 6509573709067523871, i64 400} -; ALL: ![[M4]] = !{![[C4:[0-9]+]], !"cold" -; EXCL50-NOT: ! -; EXCL50-SAME: } -; INCL50-SAME: , ![[S4:[0-9]+]]} -; ALL: ![[C4]] = !{i64 590523745590780990, i64 -6953100768213558995} -; INCL50: ![[S4]] = !{i64 -2629930016428010893, i64 200} -; ALL: ![[M5]] = !{![[C5:[0-9]+]], !"cold" -; EXCL75-NOT: ! -; EXCL75-SAME: } -; INCL75-SAME: , ![[S5:[0-9]+]]} -; ALL: ![[C5]] = !{i64 590523745590780990, i64 -6435117705372285425} -; INCL75: ![[S5]] = !{i64 86248815258696712, i64 300} +; THRESH50: ![[M2]] = !{![[C2:[0-9]+]], !"cold"} +; THRESH50: ![[C2]] = !{i64 590523745590780990, i64 720385627691022109} +; THRESH50: ![[M3]] = !{![[C3:[0-9]+]], !"cold", ![[S3:[0-9]+]]} +; THRESH50: ![[C3]] = !{i64 590523745590780990, i64 8256520048276991898} +; THRESH50: ![[S3]] = !{i64 6509573709067523871, i64 400} +;; MIB with size 200 is now included with threshold 50% +; THRESH50: ![[M4]] = !{![[C4:[0-9]+]], !"cold", ![[S4:[0-9]+]]} +; THRESH50: ![[C4]] = !{i64 590523745590780990, i64 -6953100768213558995} +; THRESH50: ![[S4]] = !{i64 -2629930016428010893, i64 200} +; THRESH50: ![[M5]] = !{![[C5:[0-9]+]], !"cold", ![[S5:[0-9]+]]} +; THRESH50: ![[C5]] = !{i64 590523745590780990, i64 -6435117705372285425} +; THRESH50: ![[S5]] = !{i64 86248815258696712, i64 300} ;; Non cold context never gets context size info -; ALL: ![[M6]] = !{![[C6:[0-9]+]], !"notcold"} -; ALL: ![[C6]] = !{i64 590523745590780990, i64 -2847840206325626610} +; THRESH50: ![[M6]] = !{![[C6:[0-9]+]], !"notcold"} +; THRESH50: ![[C6]] = !{i64 590523745590780990, i64 -2847840206325626610} declare ptr @_Znam(i64) From 98f4b7e897c1ae911dd66eea7da8748391443631 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Tue, 3 Jun 2025 13:26:05 -0700 Subject: [PATCH 3/3] Update MemoryProfileInfo.cpp clang format fix --- llvm/lib/Analysis/MemoryProfileInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index c5a1a6de8379c..c08024a38ffc2 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -187,8 +187,8 @@ void CallStackTrie::addCallStack(MDNode *MIB) { static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef MIBCallStack, AllocationType AllocType, ArrayRef ContextSizeInfo, - const uint64_t MaxColdSize, - uint64_t &TotalBytes, uint64_t &ColdBytes) { + const uint64_t MaxColdSize, uint64_t &TotalBytes, + uint64_t &ColdBytes) { SmallVector MIBPayload( {buildCallstackMetadata(MIBCallStack, Ctx)}); MIBPayload.push_back(