Skip to content

Commit 1f5e201

Browse files
htyutstellar
authored and committed
[CSSPGO] Process functions in a top-down order on a dynamic call graph.
Functions are currently processed by the sample profiler loader in a top-down order defined by the static call graph. The order is being adjusted to be a top-down order based on the input context-sensitive profile. One benefit is that the processing order of caller and callee in one SCC would follow the context order in the profile to favor more inlining. Another benefit is that the processing order of caller and callee through an indirect call (which is not on the static call graph) can be honored which in turn allows for more inlining. The profile top-down order for SCC is also extended to support non-CS profiles. Two switches `-mllvm -use-profile-indirect-call-edges` and `-mllvm -use-profile-top-down-order` are being introduced. Reviewed By: wmi Differential Revision: https://reviews.llvm.org/D95988
1 parent 1a5bb1e commit 1f5e201

File tree

7 files changed

+620
-5
lines changed

7 files changed

+620
-5
lines changed

llvm/include/llvm/Transforms/IPO/SampleContextTracker.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "llvm/ADT/SmallSet.h"
1919
#include "llvm/ADT/StringMap.h"
2020
#include "llvm/ADT/StringRef.h"
21+
#include "llvm/Analysis/CallGraph.h"
2122
#include "llvm/IR/DebugInfoMetadata.h"
2223
#include "llvm/IR/Instructions.h"
2324
#include "llvm/ProfileData/SampleProf.h"
@@ -90,6 +91,8 @@ class ContextTrieNode {
9091
// calling context and the context is identified by path from root to the node.
9192
class SampleContextTracker {
9293
public:
94+
using ContextSamplesTy = SmallSet<FunctionSamples *, 16>;
95+
9396
SampleContextTracker(StringMap<FunctionSamples> &Profiles);
9497
// Query context profile for a specific callee with given name at a given
9598
// call-site. The full context is identified by location of call instruction.
@@ -103,6 +106,9 @@ class SampleContextTracker {
103106
FunctionSamples *getContextSamplesFor(const DILocation *DIL);
104107
// Query context profile for a given sample context of a function.
105108
FunctionSamples *getContextSamplesFor(const SampleContext &Context);
109+
// Get all context profiles for a given function.
110+
ContextSamplesTy &getAllContextSamplesFor(const Function &Func);
111+
ContextSamplesTy &getAllContextSamplesFor(StringRef Name);
106112
// Query base profile for a given function. A base profile is a merged view
107113
// of all context profiles for contexts that are not inlined.
108114
FunctionSamples *getBaseSamplesFor(const Function &Func,
@@ -113,6 +119,9 @@ class SampleContextTracker {
113119
// This makes sure that inlined context profile will be excluded in
114120
// function's base profile.
115121
void markContextSamplesInlined(const FunctionSamples *InlinedSamples);
122+
void promoteMergeContextSamplesTree(const Instruction &Inst,
123+
StringRef CalleeName);
124+
void addCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap);
116125
// Dump the internal context profile trie.
117126
void dump();
118127

@@ -126,16 +135,14 @@ class SampleContextTracker {
126135
ContextTrieNode *getTopLevelContextNode(StringRef FName);
127136
ContextTrieNode &addTopLevelContextNode(StringRef FName);
128137
ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &NodeToPromo);
129-
void promoteMergeContextSamplesTree(const Instruction &Inst,
130-
StringRef CalleeName);
131138
void mergeContextNode(ContextTrieNode &FromNode, ContextTrieNode &ToNode,
132139
StringRef ContextStrToRemove);
133140
ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &FromNode,
134141
ContextTrieNode &ToNodeParent,
135142
StringRef ContextStrToRemove);
136143

137144
// Map from function name to context profiles (excluding base profile)
138-
StringMap<SmallSet<FunctionSamples *, 16>> FuncToCtxtProfileSet;
145+
StringMap<ContextSamplesTy> FuncToCtxtProfileSet;
139146

140147
// Root node for context trie tree
141148
ContextTrieNode RootContext;

llvm/lib/Transforms/IPO/SampleContextTracker.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,17 @@ SampleContextTracker::getContextSamplesFor(const SampleContext &Context) {
263263
return Node->getFunctionSamples();
264264
}
265265

266+
SampleContextTracker::ContextSamplesTy &
267+
SampleContextTracker::getAllContextSamplesFor(const Function &Func) {
268+
StringRef CanonName = FunctionSamples::getCanonicalFnName(Func);
269+
return FuncToCtxtProfileSet[CanonName];
270+
}
271+
272+
SampleContextTracker::ContextSamplesTy &
273+
SampleContextTracker::getAllContextSamplesFor(StringRef Name) {
274+
return FuncToCtxtProfileSet[Name];
275+
}
276+
266277
FunctionSamples *SampleContextTracker::getBaseSamplesFor(const Function &Func,
267278
bool MergeContext) {
268279
StringRef CanonName = FunctionSamples::getCanonicalFnName(Func);
@@ -550,4 +561,25 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree(
550561
return *ToNode;
551562
}
552563

564+
// Add dynamic call edges from the context profile trie to the call graph.
565+
void SampleContextTracker::addCallGraphEdges(CallGraph &CG,
566+
StringMap<Function *> &SymbolMap) {
567+
// Add profile call edges to the call graph.
568+
std::queue<ContextTrieNode *> NodeQueue;
569+
NodeQueue.push(&RootContext);
570+
while (!NodeQueue.empty()) {
571+
ContextTrieNode *Node = NodeQueue.front();
572+
NodeQueue.pop();
573+
Function *F = SymbolMap.lookup(Node->getFuncName());
574+
for (auto &I : Node->getAllChildContext()) {
575+
ContextTrieNode *ChildNode = &I.second;
576+
NodeQueue.push(ChildNode);
577+
if (F && !F->isDeclaration()) {
578+
Function *Callee = SymbolMap.lookup(ChildNode->getFuncName());
579+
if (Callee && !Callee->isDeclaration())
580+
CG[F]->addCalledFunction(nullptr, CG[Callee]);
581+
}
582+
}
583+
}
584+
}
553585
} // namespace llvm

llvm/lib/Transforms/IPO/SampleProfile.cpp

Lines changed: 135 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,16 @@ static cl::opt<bool> ProfileTopDownLoad(
177177
"order of call graph during sample profile loading. It only "
178178
"works for new pass manager. "));
179179

180+
static cl::opt<bool> UseProfileIndirectCallEdges(
181+
"use-profile-indirect-call-edges", cl::init(true), cl::Hidden,
182+
cl::desc("Considering indirect call samples from profile when top-down "
183+
"processing functions. Only CSSPGO is supported."));
184+
185+
static cl::opt<bool> UseProfileTopDownOrder(
186+
"use-profile-top-down-order", cl::init(false), cl::Hidden,
187+
cl::desc("Process functions in one SCC in a top-down order "
188+
"based on the input profile."));
189+
180190
static cl::opt<bool> ProfileSizeInline(
181191
"sample-profile-inline-size", cl::Hidden, cl::init(false),
182192
cl::desc("Inline cold call sites in profile loader if it's beneficial "
@@ -458,6 +468,8 @@ class SampleProfileLoader {
458468
uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
459469
void buildEdges(Function &F);
460470
std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
471+
void addCallGraphEdges(CallGraph &CG, const FunctionSamples &Samples);
472+
void replaceCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap);
461473
bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
462474
void computeDominanceAndLoopInfo(Function &F);
463475
void clearFunctionData();
@@ -2278,6 +2290,45 @@ INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
22782290
INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
22792291
"Sample Profile loader", false, false)
22802292

2293+
// Add inlined profile call edges to the call graph.
2294+
void SampleProfileLoader::addCallGraphEdges(CallGraph &CG,
2295+
const FunctionSamples &Samples) {
2296+
Function *Caller = SymbolMap.lookup(Samples.getFuncName());
2297+
if (!Caller || Caller->isDeclaration())
2298+
return;
2299+
2300+
// Skip non-inlined call edges which are not important since top down inlining
2301+
// for non-CS profile is to get more precise profile matching, not to enable
2302+
// more inlining.
2303+
2304+
for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) {
2305+
for (const auto &InlinedSamples : CallsiteSamples.second) {
2306+
Function *Callee = SymbolMap.lookup(InlinedSamples.first);
2307+
if (Callee && !Callee->isDeclaration())
2308+
CG[Caller]->addCalledFunction(nullptr, CG[Callee]);
2309+
addCallGraphEdges(CG, InlinedSamples.second);
2310+
}
2311+
}
2312+
}
2313+
2314+
// Replace call graph edges with dynamic call edges from the profile.
2315+
void SampleProfileLoader::replaceCallGraphEdges(
2316+
CallGraph &CG, StringMap<Function *> &SymbolMap) {
2317+
// Remove static call edges from the call graph except for the ones from the
2318+
// root which make the call graph connected.
2319+
for (const auto &Node : CG)
2320+
if (Node.second.get() != CG.getExternalCallingNode())
2321+
Node.second->removeAllCalledFunctions();
2322+
2323+
// Add profile call edges to the call graph.
2324+
if (ProfileIsCS) {
2325+
ContextTracker->addCallGraphEdges(CG, SymbolMap);
2326+
} else {
2327+
for (const auto &Samples : Reader->getProfiles())
2328+
addCallGraphEdges(CG, Samples.second);
2329+
}
2330+
}
2331+
22812332
std::vector<Function *>
22822333
SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
22832334
std::vector<Function *> FunctionOrderList;
@@ -2300,16 +2351,97 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
23002351
}
23012352

23022353
assert(&CG->getModule() == &M);
2354+
2355+
// Add indirect call edges from profile to augment the static call graph.
2356+
// Functions will be processed in a top-down order defined by the static call
2357+
// graph. Adjusting the order by considering indirect call edges from the
2358+
// profile (which don't exist in the static call graph) can enable the
2359+
// inlining of indirect call targets by processing the caller before them.
2360+
// TODO: enable this for non-CS profile and fix the counts returning logic to
2361+
// have a full support for indirect calls.
2362+
if (UseProfileIndirectCallEdges && ProfileIsCS) {
2363+
for (auto &Entry : *CG) {
2364+
const auto *F = Entry.first;
2365+
if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile"))
2366+
continue;
2367+
auto &AllContexts = ContextTracker->getAllContextSamplesFor(F->getName());
2368+
if (AllContexts.empty())
2369+
continue;
2370+
2371+
for (const auto &BB : *F) {
2372+
for (const auto &I : BB.getInstList()) {
2373+
const auto *CB = dyn_cast<CallBase>(&I);
2374+
if (!CB || !CB->isIndirectCall())
2375+
continue;
2376+
const DebugLoc &DLoc = I.getDebugLoc();
2377+
if (!DLoc)
2378+
continue;
2379+
auto CallSite = FunctionSamples::getCallSiteIdentifier(DLoc);
2380+
for (FunctionSamples *Samples : AllContexts) {
2381+
if (auto CallTargets = Samples->findCallTargetMapAt(CallSite)) {
2382+
for (const auto &Target : CallTargets.get()) {
2383+
Function *Callee = SymbolMap.lookup(Target.first());
2384+
if (Callee && !Callee->isDeclaration())
2385+
Entry.second->addCalledFunction(nullptr, (*CG)[Callee]);
2386+
}
2387+
}
2388+
}
2389+
}
2390+
}
2391+
}
2392+
}
2393+
2394+
// Compute a top-down order from the profile which is used to sort functions in
2395+
// one SCC later. The static processing order computed for an SCC may not
2396+
// reflect the call contexts in the context-sensitive profile, thus may cause
2397+
// potential inlining to be overlooked. The function order in one SCC is being
2398+
// adjusted to a top-down order based on the profile to favor more inlining.
2399+
DenseMap<Function *, uint64_t> ProfileOrderMap;
2400+
if (UseProfileTopDownOrder ||
2401+
(ProfileIsCS && !UseProfileTopDownOrder.getNumOccurrences())) {
2402+
// Create a static call graph. The call edges are not important since they
2403+
// will be replaced by dynamic edges from the profile.
2404+
CallGraph ProfileCG(M);
2405+
replaceCallGraphEdges(ProfileCG, SymbolMap);
2406+
scc_iterator<CallGraph *> CGI = scc_begin(&ProfileCG);
2407+
uint64_t I = 0;
2408+
while (!CGI.isAtEnd()) {
2409+
for (CallGraphNode *Node : *CGI) {
2410+
if (auto *F = Node->getFunction())
2411+
ProfileOrderMap[F] = ++I;
2412+
}
2413+
++CGI;
2414+
}
2415+
}
2416+
23032417
scc_iterator<CallGraph *> CGI = scc_begin(CG);
23042418
while (!CGI.isAtEnd()) {
2305-
for (CallGraphNode *node : *CGI) {
2306-
auto F = node->getFunction();
2419+
uint64_t Start = FunctionOrderList.size();
2420+
for (CallGraphNode *Node : *CGI) {
2421+
auto *F = Node->getFunction();
23072422
if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
23082423
FunctionOrderList.push_back(F);
23092424
}
2425+
2426+
// Sort nodes in SCC based on the profile top-down order.
2427+
if (!ProfileOrderMap.empty()) {
2428+
std::stable_sort(FunctionOrderList.begin() + Start,
2429+
FunctionOrderList.end(),
2430+
[&ProfileOrderMap](Function *Left, Function *Right) {
2431+
return ProfileOrderMap[Left] < ProfileOrderMap[Right];
2432+
});
2433+
}
2434+
23102435
++CGI;
23112436
}
23122437

2438+
LLVM_DEBUG({
2439+
dbgs() << "Function processing order:\n";
2440+
for (auto F : reverse(FunctionOrderList)) {
2441+
dbgs() << F->getName() << "\n";
2442+
}
2443+
});
2444+
23132445
std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
23142446
return FunctionOrderList;
23152447
}
@@ -2461,6 +2593,7 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
24612593
}
24622594

24632595
bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
2596+
LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
24642597
DILocation2SampleMap.clear();
24652598
// By default the entry count is initialized to -1, which will be treated
24662599
// conservatively by getEntryCount as the same as unknown (None). This is
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:11
2+
0: 6
3+
1: 6
4+
3: 287884
5+
15: 23
6+
[main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20
7+
0: 15
8+
1: 15
9+
3: 74946
10+
10: 23324
11+
15: 11
12+
[main]:154:0
13+
2: 12
14+
3: 18 _Z5funcAi:11
15+
3.1: 18 _Z5funcBi:19
16+
[external:12 @ main]:154:12
17+
2: 12
18+
3: 10 _Z5funcAi:7
19+
3.1: 10 _Z5funcBi:11
20+
[main:3.1 @ _Z5funcBi]:120:19
21+
0: 19
22+
1: 19 _Z8funcLeafi:20
23+
3: 12
24+
[externalA:17 @ _Z5funcBi]:120:3
25+
0: 3
26+
1: 3
27+
[external:10 @ _Z5funcBi]:120:10
28+
0: 10
29+
1: 10
30+
[main:3 @ _Z5funcAi]:99:11
31+
0: 10
32+
1: 10 _Z8funcLeafi:11
33+
2: 287864 _Z3fibi:315608
34+
3: 24
35+
[main:3 @ _Z5funcAi:2 @ _Z3fibi]:287864:315608
36+
0: 362839
37+
1: 6
38+
3: 287884
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
_Z8funcLeafi:500853:20
2+
0: 15
3+
1: 15
4+
3: 74946
5+
10: 23324
6+
15: 11
7+
main:154:0
8+
2: 12
9+
3: 18 _Z5funcAi:11
10+
3.1: 18 _Z5funcBi:19
11+
main:154:12
12+
2: 12
13+
3: 10 _Z5funcAi:7
14+
3.1: 10 _Z5funcBi:11
15+
_Z5funcBi:120:19
16+
0: 19
17+
1: 19 _Z8funcLeafi:20
18+
3: 12
19+
_Z5funcBi:120:3
20+
0: 3
21+
1: 3
22+
_Z5funcBi:120:10
23+
0: 10
24+
1: 10
25+
_Z5funcAi:99:11
26+
0: 10
27+
1: _Z8funcLeafi:40
28+
0: 6
29+
1: 6
30+
3: 2
31+
15: 23
32+
2: 315608 _Z3fibi:362839
33+
0: 315608
34+
1: 6
35+
3: 287884
36+
3: 24

0 commit comments

Comments
 (0)