Skip to content

Commit 772f768

Browse files
committed
[MemProf] Optionally discard small non-cold contexts
Adds a new option, -memprof-callsite-cold-threshold, which specifies a percentage threshold: when the percentage of cold bytes at a callsite (including a given context) meets or exceeds the threshold, the non-cold contexts reaching that callsite are discarded. The default is 100% (no discarding). This reduces the amount of cloning needed to expose cold allocation contexts when parts of the context are dominantly cold. This change motivated PR138792: because discarding a context may alter which not-cold contexts must be kept to expose cloning requirements, that decision now needs to be made on the fly. Additionally, this required including the context size information in the alloc trie in more cases, so the inclusion of that information in the generated metadata is now guarded by the relevant option values.
1 parent 09c80e2 commit 772f768

File tree

4 files changed

+258
-26
lines changed

4 files changed

+258
-26
lines changed

llvm/include/llvm/Analysis/MemoryProfileInfo.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,8 @@ class CallStackTrie {
103103
bool buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
104104
std::vector<uint64_t> &MIBCallStack,
105105
std::vector<Metadata *> &MIBNodes,
106-
bool CalleeHasAmbiguousCallerContext);
106+
bool CalleeHasAmbiguousCallerContext, uint64_t &TotalBytes,
107+
uint64_t &ColdBytes);
107108

108109
public:
109110
CallStackTrie() = default;

llvm/lib/Analysis/MemoryProfileInfo.cpp

Lines changed: 82 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "llvm/Analysis/MemoryProfileInfo.h"
1414
#include "llvm/IR/Constants.h"
1515
#include "llvm/Support/CommandLine.h"
16+
#include "llvm/Support/Format.h"
1617

1718
using namespace llvm;
1819
using namespace llvm::memprof;
@@ -58,6 +59,19 @@ cl::opt<bool> MemProfKeepAllNotColdContexts(
5859
"memprof-keep-all-not-cold-contexts", cl::init(false), cl::Hidden,
5960
cl::desc("Keep all non-cold contexts (increases cloning overheads)"));
6061

62+
cl::opt<unsigned> MinClonedColdBytePercent(
63+
"memprof-cloning-cold-threshold", cl::init(100), cl::Hidden,
64+
cl::desc("Min percent of cold bytes to hint alloc cold during cloning"));
65+
66+
// Discard non-cold contexts if they overlap with much larger cold contexts,
67+
// specifically, if all contexts reaching a given callsite are at least this
68+
// percent cold byte allocations. This reduces the amount of cloning required
69+
// to expose the cold contexts when they greatly dominate non-cold contexts.
70+
cl::opt<unsigned> MinCallsiteColdBytePercent(
71+
"memprof-callsite-cold-threshold", cl::init(100), cl::Hidden,
72+
cl::desc("Min percent of cold bytes at a callsite to discard non-cold "
73+
"contexts"));
74+
6175
AllocationType llvm::memprof::getAllocType(uint64_t TotalLifetimeAccessDensity,
6276
uint64_t AllocCount,
6377
uint64_t TotalLifetime) {
@@ -208,21 +222,32 @@ void CallStackTrie::addCallStack(MDNode *MIB) {
208222

209223
static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
210224
AllocationType AllocType,
211-
ArrayRef<ContextTotalSize> ContextSizeInfo) {
225+
ArrayRef<ContextTotalSize> ContextSizeInfo,
226+
uint64_t &TotalBytes, uint64_t &ColdBytes) {
212227
SmallVector<Metadata *> MIBPayload(
213228
{buildCallstackMetadata(MIBCallStack, Ctx)});
214229
MIBPayload.push_back(
215230
MDString::get(Ctx, getAllocTypeAttributeString(AllocType)));
216231
if (!ContextSizeInfo.empty()) {
217232
for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) {
218-
auto *FullStackIdMD = ValueAsMetadata::get(
219-
ConstantInt::get(Type::getInt64Ty(Ctx), FullStackId));
220-
auto *TotalSizeMD = ValueAsMetadata::get(
221-
ConstantInt::get(Type::getInt64Ty(Ctx), TotalSize));
222-
auto *ContextSizeMD = MDNode::get(Ctx, {FullStackIdMD, TotalSizeMD});
223-
MIBPayload.push_back(ContextSizeMD);
233+
TotalBytes += TotalSize;
234+
if (AllocType == AllocationType::Cold)
235+
ColdBytes += TotalSize;
236+
// Only add the context size info as metadata if we need it in the thin
237+
// link (currently if reporting of hinted sizes is enabled or we have
238+
// specified a threshold for marking allocations cold after cloning).
239+
if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) {
240+
auto *FullStackIdMD = ValueAsMetadata::get(
241+
ConstantInt::get(Type::getInt64Ty(Ctx), FullStackId));
242+
auto *TotalSizeMD = ValueAsMetadata::get(
243+
ConstantInt::get(Type::getInt64Ty(Ctx), TotalSize));
244+
auto *ContextSizeMD = MDNode::get(Ctx, {FullStackIdMD, TotalSizeMD});
245+
MIBPayload.push_back(ContextSizeMD);
246+
}
224247
}
225248
}
249+
assert(MinCallsiteColdBytePercent >= 100 ||
250+
(!ContextSizeInfo.empty() && TotalBytes > 0));
226251
return MDNode::get(Ctx, MIBPayload);
227252
}
228253

@@ -246,9 +271,13 @@ void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) {
246271
// on options that enable filtering out some NotCold contexts.
247272
static void saveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes,
248273
std::vector<Metadata *> &SavedMIBNodes,
249-
unsigned CallerContextLength) {
274+
unsigned CallerContextLength,
275+
uint64_t TotalBytes, uint64_t ColdBytes) {
276+
bool MostlyCold = MinCallsiteColdBytePercent < 100 &&
277+
ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes;
278+
250279
// In the simplest case, with pruning disabled, keep all the new MIB nodes.
251-
if (MemProfKeepAllNotColdContexts) {
280+
if (MemProfKeepAllNotColdContexts && !MostlyCold) {
252281
append_range(SavedMIBNodes, NewMIBNodes);
253282
return;
254283
}
@@ -271,6 +300,27 @@ static void saveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes,
271300
}
272301
};
273302

303+
if (MostlyCold) {
304+
auto NewColdMIBNodes =
305+
make_filter_range(NewMIBNodes, [&](const Metadata *M) {
306+
auto MIBMD = cast<MDNode>(M);
307+
// Only append cold contexts.
308+
if (getMIBAllocType(MIBMD) == AllocationType::Cold)
309+
return true;
310+
if (MemProfReportHintedSizes) {
311+
float PercentCold = ColdBytes * 100.0 / TotalBytes;
312+
std::string PercentStr;
313+
llvm::raw_string_ostream OS(PercentStr);
314+
OS << format(" for %5.2f%% cold bytes", PercentCold);
315+
EmitMessageForRemovedContexts(MIBMD, "discarded", OS.str());
316+
}
317+
return false;
318+
});
319+
for (auto *M : NewColdMIBNodes)
320+
SavedMIBNodes.push_back(M);
321+
return;
322+
}
323+
274324
// Prune unneeded NotCold contexts, taking advantage of the fact
275325
// that we later will only clone Cold contexts, as NotCold is the allocation
276326
// default. We only need to keep as metadata the NotCold contexts that
@@ -341,17 +391,20 @@ static void saveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes,
341391
// Recursive helper to trim contexts and create metadata nodes.
342392
// Caller should have pushed Node's loc to MIBCallStack. Doing this in the
343393
// caller makes it simpler to handle the many early returns in this method.
394+
// Updates the total and cold profiled bytes in the subtrie rooted at this node.
344395
bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
345396
std::vector<uint64_t> &MIBCallStack,
346397
std::vector<Metadata *> &MIBNodes,
347-
bool CalleeHasAmbiguousCallerContext) {
398+
bool CalleeHasAmbiguousCallerContext,
399+
uint64_t &TotalBytes, uint64_t &ColdBytes) {
348400
// Trim context below the first node in a prefix with a single alloc type.
349401
// Add an MIB record for the current call stack prefix.
350402
if (hasSingleAllocType(Node->AllocTypes)) {
351403
std::vector<ContextTotalSize> ContextSizeInfo;
352404
collectContextSizeInfo(Node, ContextSizeInfo);
353-
MIBNodes.push_back(createMIBNode(
354-
Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo));
405+
MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack,
406+
(AllocationType)Node->AllocTypes,
407+
ContextSizeInfo, TotalBytes, ColdBytes));
355408
return true;
356409
}
357410

@@ -364,17 +417,25 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
364417
// that will later be filtered before adding to the caller's MIBNodes
365418
// vector.
366419
std::vector<Metadata *> NewMIBNodes;
420+
// Determine the total and cold byte counts for all callers, then add to the
421+
// caller's counts further below.
422+
uint64_t CallerTotalBytes = 0;
423+
uint64_t CallerColdBytes = 0;
367424
for (auto &Caller : Node->Callers) {
368425
MIBCallStack.push_back(Caller.first);
369-
AddedMIBNodesForAllCallerContexts &=
370-
buildMIBNodes(Caller.second, Ctx, MIBCallStack, NewMIBNodes,
371-
NodeHasAmbiguousCallerContext);
426+
AddedMIBNodesForAllCallerContexts &= buildMIBNodes(
427+
Caller.second, Ctx, MIBCallStack, NewMIBNodes,
428+
NodeHasAmbiguousCallerContext, CallerTotalBytes, CallerColdBytes);
372429
// Remove Caller.
373430
MIBCallStack.pop_back();
374431
}
375432
// Pass in the stack length of the MIB nodes added for the immediate caller,
376433
// which is the current stack length plus 1.
377-
saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1);
434+
saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1,
435+
CallerTotalBytes, CallerColdBytes);
436+
TotalBytes += CallerTotalBytes;
437+
ColdBytes += CallerColdBytes;
438+
378439
if (AddedMIBNodesForAllCallerContexts)
379440
return true;
380441
// We expect that the callers should be forced to add MIBs to disambiguate
@@ -397,7 +458,7 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
397458
std::vector<ContextTotalSize> ContextSizeInfo;
398459
collectContextSizeInfo(Node, ContextSizeInfo);
399460
MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold,
400-
ContextSizeInfo));
461+
ContextSizeInfo, TotalBytes, ColdBytes));
401462
return true;
402463
}
403464

@@ -444,12 +505,15 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
444505
std::vector<uint64_t> MIBCallStack;
445506
MIBCallStack.push_back(AllocStackId);
446507
std::vector<Metadata *> MIBNodes;
508+
uint64_t TotalBytes = 0;
509+
uint64_t ColdBytes = 0;
447510
assert(!Alloc->Callers.empty() && "addCallStack has not been called yet");
448511
// The CalleeHasAmbiguousCallerContext flag is meant to say whether the
449512
// callee of the given node has more than one caller. Here the node being
450513
// passed in is the alloc and it has no callees. So it's false.
451514
if (buildMIBNodes(Alloc, Ctx, MIBCallStack, MIBNodes,
452-
/*CalleeHasAmbiguousCallerContext=*/false)) {
515+
/*CalleeHasAmbiguousCallerContext=*/false, TotalBytes,
516+
ColdBytes)) {
453517
assert(MIBCallStack.size() == 1 &&
454518
"Should only be left with Alloc's location in stack");
455519
CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes));

llvm/lib/Transforms/Instrumentation/MemProfiler.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -176,11 +176,9 @@ static cl::opt<bool>
176176
cl::desc("Salvage stale MemProf profile"),
177177
cl::init(false), cl::Hidden);
178178

179-
cl::opt<unsigned> MinClonedColdBytePercent(
180-
"memprof-cloning-cold-threshold", cl::init(100), cl::Hidden,
181-
cl::desc("Min percent of cold bytes to hint alloc cold during cloning"));
182-
183179
extern cl::opt<bool> MemProfReportHintedSizes;
180+
extern cl::opt<unsigned> MinClonedColdBytePercent;
181+
extern cl::opt<unsigned> MinCallsiteColdBytePercent;
184182

185183
static cl::opt<unsigned> MinMatchedColdBytePercent(
186184
"memprof-matching-cold-threshold", cl::init(100), cl::Hidden,
@@ -293,6 +291,13 @@ class ModuleMemProfiler {
293291
Function *MemProfCtorFunction = nullptr;
294292
};
295293

294+
// Options under which we need to record the context size info in the alloc trie
295+
// used to build metadata.
296+
bool recordContextSizeInfo() {
297+
return MemProfReportHintedSizes || MinClonedColdBytePercent < 100 ||
298+
MinCallsiteColdBytePercent < 100;
299+
}
300+
296301
} // end anonymous namespace
297302

298303
MemProfilerPass::MemProfilerPass() = default;
@@ -758,7 +763,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie,
758763
AllocInfo->Info.getAllocCount(),
759764
AllocInfo->Info.getTotalLifetime());
760765
std::vector<ContextTotalSize> ContextSizeInfo;
761-
if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) {
766+
if (recordContextSizeInfo()) {
762767
auto TotalSize = AllocInfo->Info.getTotalSize();
763768
assert(TotalSize);
764769
assert(FullStackId != 0);
@@ -1141,8 +1146,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
11411146
InlinedCallStack)) {
11421147
NumOfMemProfMatchedAllocContexts++;
11431148
uint64_t FullStackId = 0;
1144-
if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes ||
1145-
MinClonedColdBytePercent < 100)
1149+
if (ClPrintMemProfMatchInfo || recordContextSizeInfo())
11461150
FullStackId = computeFullStackId(AllocInfo->CallStack);
11471151
auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
11481152
TotalSize += AllocInfo->Info.getTotalSize();

0 commit comments

Comments
 (0)