Skip to content

Commit 1de7165

Browse files
[MemProf] Support cloning for indirect calls with ThinLTO (#110625)
This patch enables support for cloning in indirect callsites. This is done by synthesizing callsite records for each virtual call target from the profile metadata. In the thin link, all the synthesized records for a particular indirect callsite initially share the same context node, but support is added to partition the callsites and outgoing edges based on the callee function, creating a separate node for each target. In the LTO backend, when cloning is needed, we first perform indirect call promotion, then change the target of the new direct call to the desired clone. Note that this is ThinLTO-specific since, for regular LTO, indirect call promotion should have already occurred.
1 parent 111b062 commit 1de7165

File tree

7 files changed

+919
-53
lines changed

7 files changed

+919
-53
lines changed

llvm/include/llvm/IR/ModuleSummaryIndex.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,12 @@ struct ValueInfo {
200200
return getRef()->second.SummaryList;
201201
}
202202

203+
// Even if the index is built with GVs available, we may not have one for
204+
// summary entries synthesized for profiled indirect call targets.
// Returns true when the index was built without GVs, or when getValue() is
// non-null for this entry. Check this before calling name(), which asserts
// that a GV is present whenever the index has GVs.
205+
bool hasName() const { return !haveGVs() || getValue(); }
206+
203207
// Returns the name of this entry: the GlobalValue's name when the index has
// GVs, otherwise the name stored directly in the index entry.
StringRef name() const {
208+
// With GVs available an entry may still lack one (see hasName()); the
// U.GV dereference below would then go through a null pointer, so assert.
assert(!haveGVs() || getRef()->second.U.GV);
204209
return haveGVs() ? getRef()->second.U.GV->getName()
205210
: getRef()->second.U.Name;
206211
}

llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@
1515
#ifndef LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H
1616
#define LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H
1717

18+
#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
1819
#include "llvm/IR/GlobalValue.h"
1920
#include "llvm/IR/ModuleSummaryIndex.h"
2021
#include "llvm/IR/PassManager.h"
22+
#include "llvm/Transforms/Utils/ValueMapper.h"
2123
#include <functional>
2224

2325
namespace llvm {
@@ -36,15 +38,56 @@ class MemProfContextDisambiguation
3638
/// the IR.
3739
bool applyImport(Module &M);
3840

41+
// Builds the symtab and analysis used for ICP during ThinLTO backends.
42+
bool initializeIndirectCallPromotionInfo(Module &M);
43+
44+
// Data structure for saving indirect call profile info for use in ICP with
45+
// cloning.
46+
// Per-call record of indirect call profile info. Records are appended to the
// ICallAnalysisInfo list taken by recordICPInfo and later read by performICP
// (see their declarations in this class).
struct ICallAnalysisData {
47+
// The call this record describes (presumably the indirect call to promote).
CallBase *CB;
48+
// Profiled target candidates for CB.
// NOTE(review): presumably a copy of the candidates returned by
// ICallPromotionAnalysis::getPromotionCandidatesForInstruction - confirm.
std::vector<InstrProfValueData> CandidateProfileData;
49+
// NOTE(review): presumably the NumCandidates output of that same query -
// confirm.
uint32_t NumCandidates;
50+
// NOTE(review): presumably the TotalCount output of that same query -
// confirm.
uint64_t TotalCount;
51+
// NOTE(review): looks like the index into AllCallsites of the first
// CallsiteInfo record belonging to CB - confirm against recordICPInfo.
size_t CallsiteInfoStartIndex;
52+
};
53+
54+
// Record information needed for ICP of an indirect call, depending on its
55+
// profile information and the clone information recorded in the corresponding
56+
// CallsiteInfo records. The SI iterator points to the current iteration point
57+
// through AllCallsites in this function, and will be updated in this method
58+
// as we iterate through profiled targets. The number of clones recorded for
59+
// this indirect call is returned. The necessary information is recorded in
60+
// the ICallAnalysisInfo list for later ICP.
61+
unsigned recordICPInfo(CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
62+
ArrayRef<CallsiteInfo>::iterator &SI,
63+
SmallVector<ICallAnalysisData> &ICallAnalysisInfo);
64+
65+
// Actually performs any needed ICP in the function, using the information
66+
// recorded in the ICallAnalysisInfo list.
67+
void performICP(Module &M, ArrayRef<CallsiteInfo> AllCallsites,
68+
ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
69+
ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
70+
OptimizationRemarkEmitter &ORE);
71+
3972
/// Import summary containing cloning decisions for the ThinLTO backend.
4073
const ModuleSummaryIndex *ImportSummary;
4174

4275
// Owns the import summary specified by internal options for testing the
4376
// ThinLTO backend via opt (to simulate distributed ThinLTO).
4477
std::unique_ptr<ModuleSummaryIndex> ImportSummaryForTesting;
4578

79+
// Whether we are building with SamplePGO. This is needed for correctly
80+
// updating profile metadata on speculatively promoted calls.
81+
bool isSamplePGO;
82+
83+
// Used when performing indirect call analysis and promotion when cloning in
84+
// the ThinLTO backend during applyImport.
85+
std::unique_ptr<InstrProfSymtab> Symtab;
86+
std::unique_ptr<ICallPromotionAnalysis> ICallAnalysis;
87+
4688
public:
47-
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr);
89+
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr,
90+
bool isSamplePGO = false);
4891

4992
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
5093

llvm/lib/Analysis/ModuleSummaryAnalysis.cpp

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,11 @@ static cl::opt<std::string> ModuleSummaryDotFile(
8181
"module-summary-dot-file", cl::Hidden, cl::value_desc("filename"),
8282
cl::desc("File to emit dot graph of new summary into"));
8383

84+
// Hidden option, on by default. In this file it gates the synthesis of one
// Callsite record per profiled target for indirect calls in
// computeFunctionSummary; when it is off, indirect calls get no Callsite
// records there.
static cl::opt<bool> EnableMemProfIndirectCallSupport(
85+
"enable-memprof-indirect-call-support", cl::init(true), cl::Hidden,
86+
cl::desc(
87+
"Enable MemProf support for summarizing and cloning indirect calls"));
88+
8489
extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;
8590

8691
extern cl::opt<unsigned> MaxNumVTableAnnotations;
@@ -404,6 +409,11 @@ static void computeFunctionSummary(
404409
if (HasLocalsInUsedOrAsm && CI && CI->isInlineAsm())
405410
HasInlineAsmMaybeReferencingInternal = true;
406411

412+
// Compute this once per indirect call.
413+
uint32_t NumCandidates = 0;
414+
uint64_t TotalCount = 0;
415+
MutableArrayRef<InstrProfValueData> CandidateProfileData;
416+
407417
auto *CalledValue = CB->getCalledOperand();
408418
auto *CalledFunction = CB->getCalledFunction();
409419
if (CalledValue && !CalledFunction) {
@@ -481,9 +491,7 @@ static void computeFunctionSummary(
481491
}
482492
}
483493

484-
uint32_t NumCandidates;
485-
uint64_t TotalCount;
486-
auto CandidateProfileData =
494+
CandidateProfileData =
487495
ICallAnalysis.getPromotionCandidatesForInstruction(&I, TotalCount,
488496
NumCandidates);
489497
for (const auto &Candidate : CandidateProfileData)
@@ -495,16 +503,6 @@ static void computeFunctionSummary(
495503
if (!IsThinLTO)
496504
continue;
497505

498-
// TODO: Skip indirect calls for now. Need to handle these better, likely
499-
// by creating multiple Callsites, one per target, then speculatively
500-
// devirtualize while applying clone info in the ThinLTO backends. This
501-
// will also be important because we will have a different set of clone
502-
// versions per target. This handling needs to match that in the ThinLTO
503-
// backend so we handle things consistently for matching of callsite
504-
// summaries to instructions.
505-
if (!CalledFunction)
506-
continue;
507-
508506
// Ensure we keep this analysis in sync with the handling in the ThinLTO
509507
// backend (see MemProfContextDisambiguation::applyImport). Save this call
510508
// so that we can skip it in checking the reverse case later.
@@ -555,13 +553,24 @@ static void computeFunctionSummary(
555553
SmallVector<unsigned> StackIdIndices;
556554
for (auto StackId : InstCallsite)
557555
StackIdIndices.push_back(Index.addOrGetStackIdIndex(StackId));
558-
// Use the original CalledValue, in case it was an alias. We want
559-
// to record the call edge to the alias in that case. Eventually
560-
// an alias summary will be created to associate the alias and
561-
// aliasee.
562-
auto CalleeValueInfo =
563-
Index.getOrInsertValueInfo(cast<GlobalValue>(CalledValue));
564-
Callsites.push_back({CalleeValueInfo, StackIdIndices});
556+
if (CalledFunction) {
557+
// Use the original CalledValue, in case it was an alias. We want
558+
// to record the call edge to the alias in that case. Eventually
559+
// an alias summary will be created to associate the alias and
560+
// aliasee.
561+
auto CalleeValueInfo =
562+
Index.getOrInsertValueInfo(cast<GlobalValue>(CalledValue));
563+
Callsites.push_back({CalleeValueInfo, StackIdIndices});
564+
} else if (EnableMemProfIndirectCallSupport) {
565+
// For indirect callsites, create multiple Callsites, one per target.
566+
// This enables having a different set of clone versions per target,
567+
// and we will apply the cloning decisions while speculatively
568+
// devirtualizing in the ThinLTO backends.
569+
for (const auto &Candidate : CandidateProfileData) {
570+
auto CalleeValueInfo = Index.getOrInsertValueInfo(Candidate.Value);
571+
Callsites.push_back({CalleeValueInfo, StackIdIndices});
572+
}
573+
}
565574
}
566575
}
567576
}
@@ -1214,9 +1223,13 @@ bool llvm::mayHaveMemprofSummary(const CallBase *CB) {
12141223
if (CI && CalledFunction->isIntrinsic())
12151224
return false;
12161225
} else {
1217-
// TODO: For now skip indirect calls. See comments in
1218-
// computeFunctionSummary for what is needed to handle this.
1219-
return false;
1226+
// Skip inline assembly calls.
1227+
if (CI && CI->isInlineAsm())
1228+
return false;
1229+
// Skip direct calls via Constant.
1230+
if (!CalledValue || isa<Constant>(CalledValue))
1231+
return false;
1232+
return true;
12201233
}
12211234
return true;
12221235
}

llvm/lib/IR/AsmWriter.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3609,7 +3609,7 @@ void AssemblyWriter::printSummary(const GlobalValueSummary &Summary) {
36093609

36103610
void AssemblyWriter::printSummaryInfo(unsigned Slot, const ValueInfo &VI) {
36113611
Out << "^" << Slot << " = gv: (";
3612-
if (!VI.name().empty())
3612+
if (VI.hasName() && !VI.name().empty())
36133613
Out << "name: \"" << VI.name() << "\"";
36143614
else
36153615
Out << "guid: " << VI.getGUID();
@@ -3623,7 +3623,7 @@ void AssemblyWriter::printSummaryInfo(unsigned Slot, const ValueInfo &VI) {
36233623
Out << ")";
36243624
}
36253625
Out << ")";
3626-
if (!VI.name().empty())
3626+
if (VI.hasName() && !VI.name().empty())
36273627
Out << " ; guid = " << VI.getGUID();
36283628
Out << "\n";
36293629
}

llvm/lib/Passes/PassBuilderPipelines.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1710,7 +1710,8 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
17101710
// For ThinLTO we must apply the context disambiguation decisions early, to
17111711
// ensure we can correctly match the callsites to summary data.
17121712
if (EnableMemProfContextDisambiguation)
1713-
MPM.addPass(MemProfContextDisambiguation(ImportSummary));
1713+
MPM.addPass(MemProfContextDisambiguation(
1714+
ImportSummary, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));
17141715

17151716
// These passes import type identifier resolutions for whole-program
17161717
// devirtualization and CFI. They must run early because other passes may
@@ -1923,7 +1924,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
19231924
// amount of additional cloning required to distinguish the allocation
19241925
// contexts.
19251926
if (EnableMemProfContextDisambiguation)
1926-
MPM.addPass(MemProfContextDisambiguation());
1927+
MPM.addPass(MemProfContextDisambiguation(
1928+
/*Summary=*/nullptr,
1929+
PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));
19271930

19281931
// Optimize globals again after we ran the inliner.
19291932
MPM.addPass(GlobalOptPass());

0 commit comments

Comments
 (0)