Skip to content

Commit 6327d26

Browse files
committed
[CHR] Add a threshold for the code duplication
ControlHeightReduction (CHR) clones the code region to reduce the branches in the hot code path. The number of clones is linear in the depth of the region. Currently CHR has no control over the resulting code size increase. We are seeing one ~9000 BB function get expanded to ~250000 BBs, a 25x increase. This creates a big compile-time issue for the downstream optimizations. This patch adds a cap on the number of clones for one region. Differential Revision: https://reviews.llvm.org/D138333
1 parent b816b52 commit 6327d26

File tree

3 files changed

+251
-16
lines changed

3 files changed

+251
-16
lines changed

llvm/lib/Passes/PassBuilderPipelines.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -675,8 +675,12 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
675675
FPM.addPass(InstCombinePass());
676676
invokePeepholeEPCallbacks(FPM, Level);
677677

678+
// Don't add CHR pass for CSIRInstr build in PostLink as the profile
679+
// is still the same as the PreLink compilation.
678680
if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt &&
679-
(PGOOpt->Action == PGOOptions::IRUse ||
681+
((PGOOpt->Action == PGOOptions::IRUse &&
682+
(Phase != ThinOrFullLTOPhase::ThinLTOPostLink ||
683+
PGOOpt->CSAction != PGOOptions::CSIRInstr)) ||
680684
PGOOpt->Action == PGOOptions::SampleUse))
681685
FPM.addPass(ControlHeightReductionPass());
682686

llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ using namespace llvm;
4747

4848
#define CHR_DEBUG(X) LLVM_DEBUG(X)
4949

50+
static cl::opt<bool> DisableCHR("disable-chr", cl::init(false), cl::Hidden,
51+
cl::desc("Disable CHR for all functions"));
52+
5053
static cl::opt<bool> ForceCHR("force-chr", cl::init(false), cl::Hidden,
5154
cl::desc("Apply CHR for all functions"));
5255

@@ -66,6 +69,10 @@ static cl::opt<std::string> CHRFunctionList(
6669
"chr-function-list", cl::init(""), cl::Hidden,
6770
cl::desc("Specify file to retrieve the list of functions to apply CHR to"));
6871

72+
static cl::opt<unsigned> CHRDupThreshsold(
73+
"chr-dup-threshold", cl::init(3), cl::Hidden,
74+
cl::desc("Max number of duplications by CHR for a region"));
75+
6976
static StringSet<> CHRModules;
7077
static StringSet<> CHRFunctions;
7178

@@ -339,23 +346,27 @@ class CHR {
339346
BasicBlock *EntryBlock,
340347
BasicBlock *NewEntryBlock,
341348
ValueToValueMapTy &VMap);
342-
void fixupBranchesAndSelects(CHRScope *Scope,
343-
BasicBlock *PreEntryBlock,
344-
BranchInst *MergedBR,
345-
uint64_t ProfileCount);
346-
void fixupBranch(Region *R,
347-
CHRScope *Scope,
348-
IRBuilder<> &IRB,
349+
void fixupBranchesAndSelects(CHRScope *Scope, BasicBlock *PreEntryBlock,
350+
BranchInst *MergedBR, uint64_t ProfileCount);
351+
void fixupBranch(Region *R, CHRScope *Scope, IRBuilder<> &IRB,
349352
Value *&MergedCondition, BranchProbability &CHRBranchBias);
350-
void fixupSelect(SelectInst* SI,
351-
CHRScope *Scope,
352-
IRBuilder<> &IRB,
353+
void fixupSelect(SelectInst *SI, CHRScope *Scope, IRBuilder<> &IRB,
353354
Value *&MergedCondition, BranchProbability &CHRBranchBias);
354355
void addToMergedCondition(bool IsTrueBiased, Value *Cond,
355-
Instruction *BranchOrSelect,
356-
CHRScope *Scope,
357-
IRBuilder<> &IRB,
358-
Value *&MergedCondition);
356+
Instruction *BranchOrSelect, CHRScope *Scope,
357+
IRBuilder<> &IRB, Value *&MergedCondition);
358+
// Returns the accumulated clone count for region R: the sum of the
// DuplicationCount entries for R itself and for every ancestor reached by
// following getParent() until it yields null. Returns 0 when R is null.
unsigned getRegionDuplicationCount(const Region *R) {
359+
unsigned Count = 0;
360+
// Find out how many times region R is cloned. Note that if the parent
361+
// of R is cloned, R is also cloned, but R's clone count is not updated
362+
// from the clone of the parent. We need to accumulate all the counts
363+
// from the ancestors to get the clone count.
364+
while (R) {
365+
Count += DuplicationCount[R];
366+
R = R->getParent();
367+
}
368+
return Count;
369+
}
359370

360371
Function &F;
361372
BlockFrequencyInfo &BFI;
@@ -379,6 +390,8 @@ class CHR {
379390
DenseMap<SelectInst *, BranchProbability> SelectBiasMap;
380391
// All the scopes.
381392
DenseSet<CHRScope *> Scopes;
393+
// This maps records how many times this region is cloned.
394+
DenseMap<const Region *, unsigned> DuplicationCount;
382395
};
383396

384397
} // end anonymous namespace
@@ -396,7 +409,10 @@ raw_ostream &operator<<(raw_ostream &OS, const CHRScope &Scope) {
396409
return OS;
397410
}
398411

399-
static bool shouldApply(Function &F, ProfileSummaryInfo& PSI) {
412+
static bool shouldApply(Function &F, ProfileSummaryInfo &PSI) {
413+
if (DisableCHR)
414+
return false;
415+
400416
if (ForceCHR)
401417
return true;
402418

@@ -1666,6 +1682,26 @@ void CHR::transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs) {
16661682
CHR_DEBUG(dbgs() << "transformScopes " << *Scope << "\n");
16671683

16681684
assert(Scope->RegInfos.size() >= 1 && "Should have at least one Region");
1685+
1686+
for (RegInfo &RI : Scope->RegInfos) {
1687+
const Region *R = RI.R;
1688+
unsigned Duplication = getRegionDuplicationCount(R);
1689+
dbgs() << "Dup count for R=" << R << " is " << Duplication << "\n";
1690+
if (Duplication >= CHRDupThreshsold) {
1691+
CHR_DEBUG(dbgs() << "Reached the dup threshold of " << Duplication
1692+
<< " for this region");
1693+
ORE.emit([&]() {
1694+
return OptimizationRemarkMissed(DEBUG_TYPE, "DupThresholdReached",
1695+
R->getEntry()->getTerminator())
1696+
<< "Reached the duplication threshold for the region";
1697+
});
1698+
return;
1699+
}
1700+
}
1701+
for (RegInfo &RI : Scope->RegInfos) {
1702+
DuplicationCount[RI.R]++;
1703+
}
1704+
16691705
Region *FirstRegion = Scope->RegInfos[0].R;
16701706
BasicBlock *EntryBlock = FirstRegion->getEntry();
16711707
Region *LastRegion = Scope->RegInfos[Scope->RegInfos.size() - 1].R;
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
; Test case for capping the cloning in CHR.
2+
; RUN: opt < %s -passes='require<profile-summary>,function(chr)' -chr-dup-threshold=2 -S | FileCheck %s
3+
4+
; C source for the test case.
5+
; extern void foo(int);
6+
; __attribute__((noinline)) void goo(int r, int s, int t) {
7+
; if ((r & 2) != 0) {
8+
; if ((s & 2) != 0) {
9+
; if ((t & 2) != 0) {
10+
; foo(111);
11+
; }
12+
; if ((t & 4) != 0) {
13+
; foo(112);
14+
; }
15+
; }
16+
; if ((s & 4) != 0) {
17+
; if ((t & 2) != 0) {
18+
; foo(121);
19+
; }
20+
; if ((t & 4) != 0) {
21+
; foo(122);
22+
; }
23+
; }
24+
; }
25+
; if ((r & 4) != 0) {
26+
; if ((s & 2) != 0) {
27+
; if ((t & 2) != 0) {
28+
; foo(211);
29+
; }
30+
; if ((t & 4) != 0) {
31+
; foo(212);
32+
; }
33+
; }
34+
; if ((s & 4) != 0) {
35+
; if ((t & 2) != 0) {
36+
; foo(221);
37+
; }
38+
; if ((t & 4) != 0) {
39+
; foo(222);
40+
; }
41+
; }
42+
; }
43+
; }
44+
;
45+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
46+
target triple = "x86_64-unknown-linux-gnu"
47+
48+
define dso_local void @goo(i32 noundef %r, i32 noundef %s, i32 noundef %t) !prof !34 {
49+
entry:
50+
%and = and i32 %r, 2
51+
%cmp.not = icmp eq i32 %and, 0
52+
br i1 %cmp.not, label %if.end24, label %if.then, !prof !35
53+
54+
if.then:
55+
%and1 = and i32 %s, 2
56+
%cmp2.not = icmp eq i32 %and1, 0
57+
br i1 %cmp2.not, label %if.end11, label %if.then3, !prof !35
58+
59+
if.then3:
60+
%and4 = and i32 %t, 2
61+
%cmp5.not = icmp eq i32 %and4, 0
62+
br i1 %cmp5.not, label %if.end, label %if.then6, !prof !35
63+
64+
if.then6:
65+
tail call void @foo(i32 noundef 111)
66+
br label %if.end
67+
68+
if.end:
69+
%and7 = and i32 %t, 4
70+
%cmp8.not = icmp eq i32 %and7, 0
71+
br i1 %cmp8.not, label %if.end11, label %if.then9, !prof !35
72+
73+
if.then9:
74+
tail call void @foo(i32 noundef 112)
75+
br label %if.end11
76+
77+
if.end11:
78+
%and12 = and i32 %s, 4
79+
%cmp13.not = icmp eq i32 %and12, 0
80+
br i1 %cmp13.not, label %if.end24, label %if.then14, !prof !35
81+
82+
if.then14:
83+
%and15 = and i32 %t, 2
84+
%cmp16.not = icmp eq i32 %and15, 0
85+
br i1 %cmp16.not, label %if.end18, label %if.then17, !prof !35
86+
87+
if.then17:
88+
tail call void @foo(i32 noundef 121)
89+
br label %if.end18
90+
91+
if.end18:
92+
%and19 = and i32 %t, 4
93+
%cmp20.not = icmp eq i32 %and19, 0
94+
br i1 %cmp20.not, label %if.end24, label %if.then21, !prof !35
95+
96+
if.then21:
97+
tail call void @foo(i32 noundef 122)
98+
br label %if.end24
99+
100+
if.end24:
101+
%and25 = and i32 %r, 4
102+
%cmp26.not = icmp eq i32 %and25, 0
103+
br i1 %cmp26.not, label %if.end52, label %if.then27, !prof !35
104+
105+
if.then27:
106+
%and28 = and i32 %s, 2
107+
%cmp29.not = icmp eq i32 %and28, 0
108+
br i1 %cmp29.not, label %if.end39, label %if.then30, !prof !35
109+
110+
if.then30:
111+
%and31 = and i32 %t, 2
112+
%cmp32.not = icmp eq i32 %and31, 0
113+
br i1 %cmp32.not, label %if.end34, label %if.then33, !prof !35
114+
115+
if.then33:
116+
tail call void @foo(i32 noundef 211)
117+
br label %if.end34
118+
119+
if.end34:
120+
%and35 = and i32 %t, 4
121+
%cmp36.not = icmp eq i32 %and35, 0
122+
br i1 %cmp36.not, label %if.end39, label %if.then37, !prof !35
123+
124+
if.then37:
125+
tail call void @foo(i32 noundef 212)
126+
br label %if.end39
127+
128+
if.end39:
129+
%and40 = and i32 %s, 4
130+
%cmp41.not = icmp eq i32 %and40, 0
131+
br i1 %cmp41.not, label %if.end52, label %if.then42, !prof !35
132+
133+
if.then42:
134+
%and43 = and i32 %t, 2
135+
%cmp44.not = icmp eq i32 %and43, 0
136+
br i1 %cmp44.not, label %if.end46, label %if.then45, !prof !35
137+
138+
if.then45:
139+
tail call void @foo(i32 noundef 221)
140+
br label %if.end46
141+
142+
if.end46:
143+
%and47 = and i32 %t, 4
144+
%cmp48.not = icmp eq i32 %and47, 0
145+
br i1 %cmp48.not, label %if.end52, label %if.then49, !prof !35
146+
147+
if.then49:
148+
tail call void @foo(i32 noundef 222)
149+
br label %if.end52
150+
151+
if.end52:
152+
ret void
153+
}
154+
155+
; CHECK-LABEL: goo
156+
; CHECK-COUNT-3: {{.*}}.split:
157+
; CHECK-NOT: {{.*}}.split:
158+
159+
declare void @foo(i32 noundef)
160+
161+
!llvm.module.flags = !{!4}
162+
163+
!4 = !{i32 1, !"ProfileSummary", !5}
164+
!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
165+
!6 = !{!"ProfileFormat", !"InstrProf"}
166+
!7 = !{!"TotalCount", i64 2400001}
167+
!8 = !{!"MaxCount", i64 800000}
168+
!9 = !{!"MaxInternalCount", i64 100000}
169+
!10 = !{!"MaxFunctionCount", i64 800000}
170+
!11 = !{!"NumCounts", i64 19}
171+
!12 = !{!"NumFunctions", i64 4}
172+
!13 = !{!"IsPartialProfile", i64 0}
173+
!14 = !{!"PartialProfileRatio", double 0.000000e+00}
174+
!15 = !{!"DetailedSummary", !16}
175+
!16 = !{!17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32}
176+
!17 = !{i32 10000, i64 800000, i32 1}
177+
!18 = !{i32 100000, i64 800000, i32 1}
178+
!19 = !{i32 200000, i64 800000, i32 1}
179+
!20 = !{i32 300000, i64 800000, i32 1}
180+
!21 = !{i32 400000, i64 100000, i32 17}
181+
!22 = !{i32 500000, i64 100000, i32 17}
182+
!23 = !{i32 600000, i64 100000, i32 17}
183+
!24 = !{i32 700000, i64 100000, i32 17}
184+
!25 = !{i32 800000, i64 100000, i32 17}
185+
!26 = !{i32 900000, i64 100000, i32 17}
186+
!27 = !{i32 950000, i64 100000, i32 17}
187+
!28 = !{i32 990000, i64 100000, i32 17}
188+
!29 = !{i32 999000, i64 100000, i32 17}
189+
!30 = !{i32 999900, i64 100000, i32 17}
190+
!31 = !{i32 999990, i64 100000, i32 17}
191+
!32 = !{i32 999999, i64 100000, i32 17}
192+
!34 = !{!"function_entry_count", i64 100000}
193+
!35 = !{!"branch_weights", i32 0, i32 100000}
194+
!36 = !{!"function_entry_count", i64 1}
195+
!37 = !{!"branch_weights", i32 100000, i32 1}

0 commit comments

Comments
 (0)