Skip to content

Commit b96f791

Browse files
committed
[MemProf] Suppress duplicate clones in the LTO backend
In some cases due to phase ordering issues with re-cloning during function assignment, we may end up with duplicate clones in the summaries (calling the same set of callee clones and/or allocation hints). Ideally we would fix this in the thin link, but for now, detect and suppress these in the LTO backend. In order to satisfy possibly cross-module references, make each duplicate an alias to the first identical copy, which gets materialized. This reduces ThinLTO backend compile times.
1 parent a85d3a5 commit b96f791

File tree

3 files changed

+279
-41
lines changed

3 files changed

+279
-41
lines changed

llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp

Lines changed: 136 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "llvm/ADT/SmallSet.h"
3030
#include "llvm/ADT/SmallVector.h"
3131
#include "llvm/ADT/Statistic.h"
32+
#include "llvm/ADT/StringExtras.h"
3233
#include "llvm/Analysis/MemoryProfileInfo.h"
3334
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
3435
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -40,6 +41,7 @@
4041
#include "llvm/Support/CommandLine.h"
4142
#include "llvm/Support/GraphWriter.h"
4243
#include "llvm/Support/InterleavedRange.h"
44+
#include "llvm/Support/SHA1.h"
4345
#include "llvm/Support/raw_ostream.h"
4446
#include "llvm/Transforms/IPO.h"
4547
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
@@ -60,6 +62,9 @@ STATISTIC(FunctionClonesThinBackend,
6062
"Number of function clones created during ThinLTO backend");
6163
STATISTIC(FunctionsClonedThinBackend,
6264
"Number of functions that had clones created during ThinLTO backend");
65+
STATISTIC(
66+
FunctionCloneDuplicatesThinBackend,
67+
"Number of function clone duplicates detected during ThinLTO backend");
6368
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
6469
"cloned) during whole program analysis");
6570
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
@@ -5186,19 +5191,129 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
51865191
return Changed;
51875192
}
51885193

5194+
// Compute a SHA1 hash of the callsite and alloc version information of clone I
5195+
// in the summary, to use in detection of duplicate clones.
5196+
std::string ComputeHash(StringMap<Function *> &HashToFunc, FunctionSummary *FS,
5197+
unsigned I) {
5198+
SHA1 Hasher;
5199+
// Update hash with any callsites that call non-default (non-zero) callee
5200+
// versions.
5201+
for (auto &SN : FS->callsites()) {
5202+
// In theory all callsites and allocs in this function should have the same
5203+
// number of clone entries, but handle any discrepancies gracefully below
5204+
// for NDEBUG builds.
5205+
assert(
5206+
SN.Clones.size() > I &&
5207+
"Callsite summary has fewer entries than other summaries in function");
5208+
if (SN.Clones.size() <= I || !SN.Clones[I])
5209+
continue;
5210+
uint8_t Data[4];
5211+
support::endian::write32le(Data, SN.Clones[I]);
5212+
Hasher.update(Data);
5213+
}
5214+
// Update hash with any allocs that have non-default (non-None) hints.
5215+
for (auto &AN : FS->allocs()) {
5216+
// In theory all callsites and allocs in this function should have the same
5217+
// number of clone entries, but handle any discrepancies gracefully below
5218+
// for NDEBUG builds.
5219+
assert(AN.Versions.size() > I &&
5220+
"Alloc summary has fewer entries than other summaries in function");
5221+
if (AN.Versions.size() <= I ||
5222+
(AllocationType)AN.Versions[I] == AllocationType::None)
5223+
continue;
5224+
Hasher.update(ArrayRef<uint8_t>(&AN.Versions[I], 1));
5225+
}
5226+
return toHex(Hasher.result());
5227+
}
5228+
51895229
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
51905230
Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
51915231
std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5192-
&FuncToAliasMap) {
5232+
&FuncToAliasMap,
5233+
FunctionSummary *FS) {
5234+
auto TakeDeclNameAndReplace = [](GlobalValue *DeclGV, GlobalValue *NewGV) {
5235+
// We might have created this when adjusting callsite in another
5236+
// function. It should be a declaration.
5237+
assert(DeclGV->isDeclaration());
5238+
NewGV->takeName(DeclGV);
5239+
DeclGV->replaceAllUsesWith(NewGV);
5240+
DeclGV->eraseFromParent();
5241+
};
5242+
5243+
// Handle aliases to this function, and create analogous alias clones to the
5244+
// provided clone of this function.
5245+
auto CloneFuncAliases = [&](Function *NewF, unsigned I) {
5246+
if (!FuncToAliasMap.count(&F))
5247+
return;
5248+
for (auto *A : FuncToAliasMap[&F]) {
5249+
std::string AliasName = getMemProfFuncName(A->getName(), I);
5250+
auto *PrevA = M.getNamedAlias(AliasName);
5251+
auto *NewA = GlobalAlias::create(A->getValueType(),
5252+
A->getType()->getPointerAddressSpace(),
5253+
A->getLinkage(), AliasName, NewF);
5254+
NewA->copyAttributesFrom(A);
5255+
if (PrevA)
5256+
TakeDeclNameAndReplace(PrevA, NewA);
5257+
}
5258+
};
5259+
51935260
// The first "clone" is the original copy, we should only call this if we
51945261
// needed to create new clones.
51955262
assert(NumClones > 1);
51965263
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
51975264
VMaps.reserve(NumClones - 1);
51985265
FunctionsClonedThinBackend++;
5266+
5267+
// Map of hash of callsite/alloc versions to the instantiated function clone
5268+
// (possibly the original) implementing those calls. Used to avoid
5269+
// instantiating duplicate function clones.
5270+
// FIXME: Ideally the thin link would not generate such duplicate clones to
5271+
// start with, but right now it happens due to phase ordering in the function
5272+
// assignment and possible new clones that produces. We simply make each
5273+
// duplicate an alias to the matching instantiated clone recorded in the map
5274+
// (except for available_externally which are made declarations as they would
5275+
// be aliases in the prevailing module, and available_externally aliases are
5276+
// not well supported right now).
5277+
StringMap<Function *> HashToFunc;
5278+
5279+
// Save the hash of the original function version.
5280+
auto Hash = ComputeHash(HashToFunc, FS, 0);
5281+
HashToFunc[Hash] = &F;
5282+
51995283
for (unsigned I = 1; I < NumClones; I++) {
52005284
VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
5285+
std::string Name = getMemProfFuncName(F.getName(), I);
5286+
auto Hash = ComputeHash(HashToFunc, FS, I);
5287+
// If this clone would duplicate a previously seen clone, don't generate the
5288+
// duplicate clone body, just make an alias to satisfy any (potentially
5289+
// cross-module) references.
5290+
if (HashToFunc.contains(Hash)) {
5291+
FunctionCloneDuplicatesThinBackend++;
5292+
auto *Func = HashToFunc[Hash];
5293+
if (Func->hasAvailableExternallyLinkage()) {
5294+
// Skip these as EliminateAvailableExternallyPass does not handle
5295+
// available_externally aliases correctly and we end up with an
5296+
// available_externally alias to a declaration. Just create a
5297+
// declaration for now as we know we will have a definition in another
5298+
// module.
5299+
auto Decl = M.getOrInsertFunction(Name, Func->getFunctionType());
5300+
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5301+
<< "created clone decl " << ore::NV("Decl", Decl.getCallee()));
5302+
continue;
5303+
}
5304+
auto *PrevF = M.getFunction(Name);
5305+
auto *Alias = GlobalAlias::create(Name, Func);
5306+
if (PrevF)
5307+
TakeDeclNameAndReplace(PrevF, Alias);
5308+
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
5309+
<< "created clone alias " << ore::NV("Alias", Alias));
5310+
5311+
// Now handle aliases to this function, and clone those as well.
5312+
CloneFuncAliases(Func, I);
5313+
continue;
5314+
}
52015315
auto *NewF = CloneFunction(&F, *VMaps.back());
5316+
HashToFunc[Hash] = NewF;
52025317
FunctionClonesThinBackend++;
52035318
// Strip memprof and callsite metadata from clone as they are no longer
52045319
// needed.
@@ -5208,40 +5323,17 @@ static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
52085323
Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
52095324
}
52105325
}
5211-
std::string Name = getMemProfFuncName(F.getName(), I);
52125326
auto *PrevF = M.getFunction(Name);
5213-
if (PrevF) {
5214-
// We might have created this when adjusting callsite in another
5215-
// function. It should be a declaration.
5216-
assert(PrevF->isDeclaration());
5217-
NewF->takeName(PrevF);
5218-
PrevF->replaceAllUsesWith(NewF);
5219-
PrevF->eraseFromParent();
5220-
} else
5327+
if (PrevF)
5328+
TakeDeclNameAndReplace(PrevF, NewF);
5329+
else
52215330
NewF->setName(Name);
52225331
updateSubprogramLinkageName(NewF, Name);
52235332
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
52245333
<< "created clone " << ore::NV("NewFunction", NewF));
52255334

52265335
// Now handle aliases to this function, and clone those as well.
5227-
if (!FuncToAliasMap.count(&F))
5228-
continue;
5229-
for (auto *A : FuncToAliasMap[&F]) {
5230-
std::string Name = getMemProfFuncName(A->getName(), I);
5231-
auto *PrevA = M.getNamedAlias(Name);
5232-
auto *NewA = GlobalAlias::create(A->getValueType(),
5233-
A->getType()->getPointerAddressSpace(),
5234-
A->getLinkage(), Name, NewF);
5235-
NewA->copyAttributesFrom(A);
5236-
if (PrevA) {
5237-
// We might have created this when adjusting callsite in another
5238-
// function. It should be a declaration.
5239-
assert(PrevA->isDeclaration());
5240-
NewA->takeName(PrevA);
5241-
PrevA->replaceAllUsesWith(NewA);
5242-
PrevA->eraseFromParent();
5243-
}
5244-
}
5336+
CloneFuncAliases(NewF, I);
52455337
}
52465338
return VMaps;
52475339
}
@@ -5401,7 +5493,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
54015493
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
54025494
bool ClonesCreated = false;
54035495
unsigned NumClonesCreated = 0;
5404-
auto CloneFuncIfNeeded = [&](unsigned NumClones) {
5496+
auto CloneFuncIfNeeded = [&](unsigned NumClones, FunctionSummary *FS) {
54055497
// We should at least have version 0 which is the original copy.
54065498
assert(NumClones > 0);
54075499
// If only one copy needed use original.
@@ -5415,7 +5507,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
54155507
assert(NumClonesCreated == NumClones);
54165508
return;
54175509
}
5418-
VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
5510+
VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap, FS);
54195511
// The first "clone" is the original copy, which doesn't have a VMap.
54205512
assert(VMaps.size() == NumClones - 1);
54215513
Changed = true;
@@ -5424,9 +5516,9 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
54245516
};
54255517

54265518
auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5427-
Function *CalledFunction) {
5519+
Function *CalledFunction, FunctionSummary *FS) {
54285520
// Perform cloning if not yet done.
5429-
CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
5521+
CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size(), FS);
54305522

54315523
assert(!isMemProfClone(*CalledFunction));
54325524

@@ -5448,6 +5540,8 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
54485540
// below.
54495541
auto CalleeOrigName = CalledFunction->getName();
54505542
for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5543+
if (J > 0 && VMaps[J - 1]->empty())
5544+
continue;
54515545
// Do nothing if this version calls the original version of its
54525546
// callee.
54535547
if (!StackNode.Clones[J])
@@ -5591,7 +5685,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
55915685
#endif
55925686

55935687
// Perform cloning if not yet done.
5594-
CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
5688+
CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size(), FS);
55955689

55965690
OrigAllocsThinBackend++;
55975691
AllocVersionsThinBackend += AllocNode.Versions.size();
@@ -5624,6 +5718,8 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
56245718

56255719
// Update the allocation types per the summary info.
56265720
for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
5721+
if (J > 0 && VMaps[J - 1]->empty())
5722+
continue;
56275723
// Ignore any that didn't get an assigned allocation type.
56285724
if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
56295725
continue;
@@ -5671,7 +5767,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
56715767
// we don't need to do ICP, but might need to clone this
56725768
// function as it is the target of other cloned calls.
56735769
if (NumClones)
5674-
CloneFuncIfNeeded(NumClones);
5770+
CloneFuncIfNeeded(NumClones, FS);
56755771
}
56765772

56775773
else {
@@ -5691,7 +5787,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
56915787
}
56925788
#endif
56935789

5694-
CloneCallsite(StackNode, CB, CalledFunction);
5790+
CloneCallsite(StackNode, CB, CalledFunction, FS);
56955791
}
56965792
} else if (CB->isTailCall() && CalledFunction) {
56975793
// Locate the synthesized callsite info for the callee VI, if any was
@@ -5701,7 +5797,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
57015797
if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
57025798
auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
57035799
assert(Callsite != MapTailCallCalleeVIToCallsite.end());
5704-
CloneCallsite(Callsite->second, CB, CalledFunction);
5800+
CloneCallsite(Callsite->second, CB, CalledFunction, FS);
57055801
}
57065802
}
57075803
}
@@ -5847,6 +5943,8 @@ void MemProfContextDisambiguation::performICP(
58475943
// check.
58485944
CallBase *CBClone = CB;
58495945
for (unsigned J = 0; J < NumClones; J++) {
5946+
if (J > 0 && VMaps[J - 1]->empty())
5947+
continue;
58505948
// Copy 0 is the original function.
58515949
if (J > 0)
58525950
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
@@ -5892,6 +5990,8 @@ void MemProfContextDisambiguation::performICP(
58925990
// TotalCount and the number promoted.
58935991
CallBase *CBClone = CB;
58945992
for (unsigned J = 0; J < NumClones; J++) {
5993+
if (J > 0 && VMaps[J - 1]->empty())
5994+
continue;
58955995
// Copy 0 is the original function.
58965996
if (J > 0)
58975997
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);

0 commit comments

Comments
 (0)