diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index cd4b9f557fb49..1bf7ff468d782 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -140,6 +140,7 @@ cl::opt<bool> MemProfRequireDefinitionForPromotion(
 } // namespace llvm
 
 extern cl::opt<bool> MemProfReportHintedSizes;
+extern cl::opt<unsigned> MinClonedColdBytePercent;
 
 namespace {
 /// CRTP base for graphs built from either IR or ThinLTO summary index.
@@ -617,6 +618,11 @@ class CallsiteContextGraph {
     static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
   }
 
+  /// Get the AllocationType assigned to the given allocation instruction clone.
+  AllocationType getAllocationCallType(const CallInfo &Call) const {
+    return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
+  }
+
   /// Update non-allocation call to invoke (possibly cloned) function
   /// CalleeFunc.
   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
@@ -711,7 +717,8 @@ class CallsiteContextGraph {
   /// Map from each contextID to the profiled full contexts and their total
   /// sizes (there may be more than one due to context trimming),
-  /// optionally populated when requested (via MemProfReportHintedSizes).
+  /// optionally populated when requested (via MemProfReportHintedSizes or
+  /// MinClonedColdBytePercent).
   DenseMap<uint32_t, std::vector<ContextTotalSize>>
       ContextIdToContextSizeInfos;
 
   /// Identifies the context node created for a stack id when adding the MIB
@@ -773,6 +780,7 @@ class ModuleCallsiteContextGraph
   uint64_t getLastStackId(Instruction *Call);
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  AllocationType getAllocationCallType(const CallInfo &Call) const;
   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
   CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                        Instruction *>::FuncInfo
@@ -852,6 +860,7 @@ class IndexCallsiteContextGraph
   uint64_t getLastStackId(IndexCall &Call);
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  AllocationType getAllocationCallType(const CallInfo &Call) const;
   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
   CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                        IndexCall>::FuncInfo
@@ -1201,8 +1210,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
   ContextIdToAllocationType[++LastContextId] = AllocType;
 
-  if (MemProfReportHintedSizes) {
-    assert(!ContextSizeInfo.empty());
+  if (!ContextSizeInfo.empty()) {
     auto &Entry = ContextIdToContextSizeInfos[LastContextId];
     Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
   }
 
@@ -2043,14 +2051,15 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
           CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
               EmptyContext;
           unsigned I = 0;
-          assert(!MemProfReportHintedSizes ||
-                 AN.ContextSizeInfos.size() == AN.MIBs.size());
+          assert(
+              (!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) ||
+              AN.ContextSizeInfos.size() == AN.MIBs.size());
           // Now add all of the MIBs and their stack nodes.
          for (auto &MIB : AN.MIBs) {
            CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
                StackContext(&MIB);
            std::vector<ContextTotalSize> ContextSizeInfo;
-            if (MemProfReportHintedSizes) {
+            if (!AN.ContextSizeInfos.empty()) {
              for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
                ContextSizeInfo.push_back({FullStackId, TotalSize});
            }
@@ -2825,6 +2834,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
     if (!Node->IsAllocation)
       continue;
     DenseSet<uint32_t> ContextIds = Node->getContextIds();
+    auto AllocTypeFromCall = getAllocationCallType(Node->Call);
     std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
     std::sort(SortedIds.begin(), SortedIds.end());
     for (auto Id : SortedIds) {
@@ -2837,7 +2847,11 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
            << getAllocTypeString((uint8_t)TypeI->second)
            << " full allocation context " << Info.FullStackId
            << " with total size " << Info.TotalSize << " is "
-           << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
+           << getAllocTypeString(Node->AllocTypes) << " after cloning";
+        if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
+          OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
+             << " due to cold byte percent";
+        OS << "\n";
       }
     }
   }
@@ -3487,6 +3501,23 @@ void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
   AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
 }
 
+AllocationType
+ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
+  const auto *CB = cast<CallBase>(Call.call());
+  if (!CB->getAttributes().hasFnAttr("memprof"))
+    return AllocationType::None;
+  return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
+             ? AllocationType::Cold
+             : AllocationType::NotCold;
+}
+
+AllocationType
+IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
+  const auto *AI = Call.call().dyn_cast<AllocInfo *>();
+  assert(AI->Versions.size() > Call.cloneNo());
+  return (AllocationType)AI->Versions[Call.cloneNo()];
+}
+
 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
                                             FuncInfo CalleeFunc) {
   if (CalleeFunc.cloneNo() > 0)
@@ -4017,6 +4048,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
     }
   }
 
+  uint8_t BothTypes =
+      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
+
   auto UpdateCalls = [&](ContextNode *Node,
                          DenseSet<const ContextNode *> &Visited,
                          auto &&UpdateCalls) {
@@ -4036,7 +4070,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
       return;
 
     if (Node->IsAllocation) {
-      updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+      auto AT = allocTypeToUse(Node->AllocTypes);
+      // If the allocation type is ambiguous, and more aggressive hinting
+      // has been enabled via the MinClonedColdBytePercent flag, check whether
+      // this allocation should be hinted cold anyway because the fraction of
+      // cold bytes allocated is at least the given threshold.
+      if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
+          !ContextIdToContextSizeInfos.empty()) {
+        uint64_t TotalCold = 0;
+        uint64_t Total = 0;
+        for (auto Id : Node->getContextIds()) {
+          auto TypeI = ContextIdToAllocationType.find(Id);
+          assert(TypeI != ContextIdToAllocationType.end());
+          auto CSI = ContextIdToContextSizeInfos.find(Id);
+          if (CSI != ContextIdToContextSizeInfos.end()) {
+            for (auto &Info : CSI->second) {
+              Total += Info.TotalSize;
+              if (TypeI->second == AllocationType::Cold)
+                TotalCold += Info.TotalSize;
+            }
+          }
+        }
+        if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
+          AT = AllocationType::Cold;
+      }
+      updateAllocationCall(Node->Call, AT);
       assert(Node->MatchingCalls.empty());
       return;
     }
@@ -4419,7 +4477,11 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
           // will still be none type or should have gotten the default NotCold.
           // Skip that after calling clone helper since that does some sanity
           // checks that confirm we haven't decided yet that we need cloning.
-          if (AllocNode.Versions.size() == 1) {
+          // We might have a single version that is cold due to the
+          // MinClonedColdBytePercent heuristic; make sure we don't skip it in
+          // that case.
+          if (AllocNode.Versions.size() == 1 &&
+              (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
             assert((AllocationType)AllocNode.Versions[0] ==
                        AllocationType::NotCold ||
                    (AllocationType)AllocNode.Versions[0] ==
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index db8999911b7f9..9831b37fb4770 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -176,6 +176,10 @@ static cl::opt<bool>
                         cl::desc("Salvage stale MemProf profile"),
                         cl::init(false), cl::Hidden);
 
+cl::opt<unsigned> MinClonedColdBytePercent(
+    "memprof-cloning-cold-threshold", cl::init(100), cl::Hidden,
+    cl::desc("Min percent of cold bytes to hint alloc cold during cloning"));
+
 extern cl::opt<bool> MemProfReportHintedSizes;
 
 static cl::opt<unsigned> MinMatchedColdBytePercent(
@@ -755,7 +759,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie,
                                AllocInfo->Info.getAllocCount(),
                                AllocInfo->Info.getTotalLifetime());
   std::vector<ContextTotalSize> ContextSizeInfo;
-  if (MemProfReportHintedSizes) {
+  if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) {
     auto TotalSize = AllocInfo->Info.getTotalSize();
     assert(TotalSize);
     assert(FullStackId != 0);
@@ -1126,7 +1130,8 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
                                                  InlinedCallStack)) {
             NumOfMemProfMatchedAllocContexts++;
             uint64_t FullStackId = 0;
-            if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes)
+            if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes ||
+                MinClonedColdBytePercent < 100)
               FullStackId = computeFullStackId(AllocInfo->CallStack);
             auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
             TotalSize += AllocInfo->Info.getTotalSize();
diff --git a/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll b/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll
index e7fbb3c0bad2c..5ce786c63ac1a 100644
--- a/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll
+++ b/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll
@@ -1,6 +1,7 @@
 ;; Test callsite context graph generation for simple call graph with
 ;; two memprof contexts and no inlining, where one callsite required for
-;; cloning is missing (e.g. unmatched).
+;; cloning is missing (e.g. unmatched). Use this to test the aggressive
+;; hinting threshold.
 ;;
 ;; Original code looks like:
 ;;
@@ -30,6 +31,30 @@
 ; RUN:  --check-prefix=SIZESUNHINTED
 ; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --implicit-check-not "\"memprof\"=\"cold\""
 
+;; Check that we do hint with a sufficient -memprof-cloning-cold-threshold.
+; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-report-hinted-sizes -memprof-cloning-cold-threshold=80 \
+; RUN:  -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKSHINTED \
+; RUN:  --check-prefix=SIZESHINTED
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED
+
+;; Check again that we hint with a sufficient -memprof-cloning-cold-threshold,
+;; even if we don't specify -memprof-report-hinted-sizes.
+; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-cloning-cold-threshold=80 \
+; RUN:  -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKSHINTED
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED
+
 source_filename = "memprof-missing-callsite.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -68,3 +93,11 @@ attributes #0 = { noinline optnone }
 ; SIZESUNHINTED: NotCold full allocation context 123 with total size 100 is NotColdCold after cloning
 ; SIZESUNHINTED: Cold full allocation context 456 with total size 200 is NotColdCold after cloning
 ; SIZESUNHINTED: Cold full allocation context 789 with total size 300 is NotColdCold after cloning
+
+; SIZESHINTED: NotCold full allocation context 123 with total size 100 is NotColdCold after cloning marked Cold due to cold byte percent
+; SIZESHINTED: Cold full allocation context 456 with total size 200 is NotColdCold after cloning marked Cold due to cold byte percent
+; SIZESHINTED: Cold full allocation context 789 with total size 300 is NotColdCold after cloning marked Cold due to cold byte percent
+
+; REMARKSHINTED: call in clone _Z3foov marked with memprof allocation attribute cold
+
+; IRHINTED: "memprof"="cold"
diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll
index 7e47c8ded4e4a..c0e44cccbf16f 100644
--- a/llvm/test/Transforms/PGOProfile/memprof.ll
+++ b/llvm/test/Transforms/PGOProfile/memprof.ll
@@ -69,6 +69,10 @@
 ;; Check that we hint additional allocations with a threshold < 100%
 ; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-report-hinted-sizes -memprof-matching-cold-threshold=60 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZESTHRESH60
 
+;; Make sure that the -memprof-cloning-cold-threshold flag is enough to cause
+;; the size metadata to be generated for the LTO link.
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-cloning-cold-threshold=80 2>&1 | FileCheck %s --check-prefixes=TOTALSIZES
+
 ;; Make sure we emit a random hotness seed if requested.
 ; RUN: llvm-profdata merge -memprof-random-hotness %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdatarand 2>&1 | FileCheck %s --check-prefix=RAND
 ; RAND: random hotness seed =
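
Reviewer note (illustration only, not part of the patch): the new heuristic in
assignFunctions() hints an allocation whose remaining contexts are ambiguous
(both cold and not-cold) as cold when the cold share of profiled bytes meets
the -memprof-cloning-cold-threshold percentage. The standalone sketch below
mirrors that integer arithmetic under simplified, hypothetical types; the
names ContextSize and chooseHint are illustrative, not LLVM APIs.

// Sketch of the cold-byte-percent decision, assuming simplified types.
#include <cstdint>
#include <vector>

enum class AllocationType : uint8_t { None = 0, NotCold = 1, Cold = 2 };

struct ContextSize {
  AllocationType Type; // allocation type of this profiled context
  uint64_t TotalSize;  // total bytes allocated from this context
};

// Returns the type to hint for an ambiguous allocation. The comparison
// TotalCold * 100 >= Total * Pct expresses TotalCold / Total >= Pct / 100
// without floating point or integer-division truncation, as in the patch.
AllocationType chooseHint(const std::vector<ContextSize> &Contexts,
                          unsigned MinClonedColdBytePercent) {
  uint64_t TotalCold = 0, Total = 0;
  for (const auto &C : Contexts) {
    Total += C.TotalSize;
    if (C.Type == AllocationType::Cold)
      TotalCold += C.TotalSize;
  }
  if (Total && TotalCold * 100 >= Total * MinClonedColdBytePercent)
    return AllocationType::Cold;
  return AllocationType::NotCold;
}

int main() {
  // Sizes from the memprof-missing-callsite.ll test: contexts of 100
  // (not cold), 200 and 300 (cold) bytes give 500/600 ~ 83% cold, which
  // meets the 80% threshold, matching the SIZESHINTED checks above.
  std::vector<ContextSize> Contexts = {{AllocationType::NotCold, 100},
                                       {AllocationType::Cold, 200},
                                       {AllocationType::Cold, 300}};
  return chooseHint(Contexts, 80) == AllocationType::Cold ? 0 : 1;
}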