Skip to content

Commit fe22a19

Browse files
author
Shimin Cui
committed
[PPC] Set minimum of largest number of comparisons to use bit test for switch
1 parent c7df883 commit fe22a19

File tree

6 files changed

+256
-18
lines changed

6 files changed

+256
-18
lines changed

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -594,12 +594,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
594594

595595
// Check if suitable for a bit test
596596
if (N <= DL.getIndexSizeInBits(0u)) {
597-
SmallPtrSet<const BasicBlock *, 4> Dests;
598-
for (auto I : SI.cases())
599-
Dests.insert(I.getCaseSuccessor());
597+
DenseMap<const BasicBlock *, unsigned int> DestMap;
598+
for (auto I : SI.cases()) {
599+
const BasicBlock *BB = I.getCaseSuccessor();
600+
DestMap[BB] = DestMap.count(BB) ? (DestMap.count(BB) + 1) : 1;
601+
}
600602

601-
if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
602-
DL))
603+
if (TLI->isSuitableForBitTests(DestMap, MinCaseVal, MaxCaseVal, DL))
603604
return 1;
604605
}
605606

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1441,9 +1441,10 @@ class LLVM_ABI TargetLoweringBase {
14411441
/// \p High as its lowest and highest case values, and expects \p NumCmps
14421442
/// case value comparisons. Check if the number of destinations, comparison
14431443
/// metric, and range are all suitable.
1444-
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
1445-
const APInt &Low, const APInt &High,
1446-
const DataLayout &DL) const {
1444+
bool
1445+
isSuitableForBitTests(DenseMap<const BasicBlock *, unsigned int> DestCmps,
1446+
const APInt &Low, const APInt &High,
1447+
const DataLayout &DL) const {
14471448
// FIXME: I don't think NumCmps is the correct metric: a single case and a
14481449
// range of cases both require only one branch to lower. Just looking at the
14491450
// number of clusters and destinations should be enough to decide whether to
@@ -1454,6 +1455,20 @@ class LLVM_ABI TargetLoweringBase {
14541455
if (!rangeFitsInWord(Low, High, DL))
14551456
return false;
14561457

1458+
unsigned NumDests = DestCmps.size();
1459+
unsigned NumCmps = 0;
1460+
unsigned int MaxBitTestEntry = 0;
1461+
for (auto &DestCmp : DestCmps) {
1462+
NumCmps += DestCmp.second;
1463+
if (DestCmp.second > MaxBitTestEntry)
1464+
MaxBitTestEntry = DestCmp.second;
1465+
}
1466+
1467+
// Comparisons might be cheaper for a small number of comparisons, which can
1468+
// be target-specific.
1469+
if (MaxBitTestEntry < getMinimumBitTestCmps())
1470+
return false;
1471+
14571472
// Decide whether it's profitable to lower this range with bit tests. Each
14581473
// destination requires a bit test and branch, and there is an overall range
14591474
// check branch. For a small number of clusters, separate comparisons might
@@ -2063,6 +2078,9 @@ class LLVM_ABI TargetLoweringBase {
20632078

20642079
virtual bool isJumpTableRelative() const;
20652080

2081+
/// Return the minimum that the largest per-destination number of comparisons must reach for a BitTest to be used.
2082+
virtual unsigned getMinimumBitTestCmps() const;
2083+
20662084
/// If a physical register, this specifies the register that
20672085
/// llvm.savestack/llvm.restorestack should save and restore.
20682086
Register getStackPointerRegisterToSaveRestore() const {
@@ -2579,6 +2597,9 @@ class LLVM_ABI TargetLoweringBase {
25792597
/// Set to zero to generate unlimited jump tables.
25802598
void setMaximumJumpTableSize(unsigned);
25812599

2600+
/// Set the minimum of the largest number of comparisons to generate BitTest.
2601+
void setMinimumBitTestCmps(unsigned Val);
2602+
25822603
/// If set to a physical register, this specifies the register that
25832604
/// llvm.savestack/llvm.restorestack should save and restore.
25842605
void setStackPointerRegisterToSaveRestore(Register R) {

llvm/lib/CodeGen/SwitchLoweringUtils.cpp

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -206,12 +206,16 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters,
206206
for (unsigned I = First; I <= Last; ++I)
207207
JTProbs[Clusters[I].MBB] = BranchProbability::getZero();
208208

209+
DenseMap<const BasicBlock *, unsigned int> DestMap;
209210
for (unsigned I = First; I <= Last; ++I) {
210211
assert(Clusters[I].Kind == CC_Range);
211212
Prob += Clusters[I].Prob;
212213
const APInt &Low = Clusters[I].Low->getValue();
213214
const APInt &High = Clusters[I].High->getValue();
214-
NumCmps += (Low == High) ? 1 : 2;
215+
unsigned int NumCmp = (Low == High) ? 1 : 2;
216+
const BasicBlock *BB = Clusters[I].MBB->getBasicBlock();
217+
DestMap[BB] = DestMap.count(BB) ? (DestMap[BB] + NumCmp) : NumCmp;
218+
215219
if (I != First) {
216220
// Fill the gap between this and the previous cluster.
217221
const APInt &PreviousHigh = Clusters[I - 1].High->getValue();
@@ -226,9 +230,7 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters,
226230
JTProbs[Clusters[I].MBB] += Clusters[I].Prob;
227231
}
228232

229-
unsigned NumDests = JTProbs.size();
230-
if (TLI->isSuitableForBitTests(NumDests, NumCmps,
231-
Clusters[First].Low->getValue(),
233+
if (TLI->isSuitableForBitTests(DestMap, Clusters[First].Low->getValue(),
232234
Clusters[Last].High->getValue(), *DL)) {
233235
// Clusters[First..Last] should be lowered as bit tests instead.
234236
return false;
@@ -372,20 +374,19 @@ bool SwitchCG::SwitchLowering::buildBitTests(CaseClusterVector &Clusters,
372374
if (First == Last)
373375
return false;
374376

375-
BitVector Dests(FuncInfo.MF->getNumBlockIDs());
376-
unsigned NumCmps = 0;
377+
DenseMap<const BasicBlock *, unsigned int> DestMap;
377378
for (int64_t I = First; I <= Last; ++I) {
378379
assert(Clusters[I].Kind == CC_Range);
379-
Dests.set(Clusters[I].MBB->getNumber());
380-
NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
380+
unsigned NumCmp = (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
381+
const BasicBlock *BB = Clusters[I].MBB->getBasicBlock();
382+
DestMap[BB] = DestMap.count(BB) ? (DestMap[BB] + NumCmp) : NumCmp;
381383
}
382-
unsigned NumDests = Dests.count();
383384

384385
APInt Low = Clusters[First].Low->getValue();
385386
APInt High = Clusters[Last].High->getValue();
386387
assert(Low.slt(High));
387388

388-
if (!TLI->isSuitableForBitTests(NumDests, NumCmps, Low, High, *DL))
389+
if (!TLI->isSuitableForBitTests(DestMap, Low, High, *DL))
389390
return false;
390391

391392
APInt LowBound;

llvm/lib/CodeGen/TargetLoweringBase.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
//===----------------------------------------------------------------------===//
1212

1313
#include "llvm/ADT/BitVector.h"
14+
#include "llvm/ADT/DenseMap.h"
1415
#include "llvm/ADT/STLExtras.h"
1516
#include "llvm/ADT/SmallVector.h"
1617
#include "llvm/ADT/StringExtras.h"
@@ -90,6 +91,11 @@ static cl::opt<unsigned> OptsizeJumpTableDensity(
9091
cl::desc("Minimum density for building a jump table in "
9192
"an optsize function"));
9293

94+
static cl::opt<unsigned>
95+
MinimumBitTestCmps("min-bit-test-cmps", cl::init(2), cl::Hidden,
96+
cl::desc("Set minimum of largest number of comparisons "
97+
"to use bit test for switch."));
98+
9399
// FIXME: This option is only to test if the strict fp operation processed
94100
// correctly by preventing mutating strict fp operation to normal fp operation
95101
// during development. When the backend supports strict float operation, this
@@ -2120,6 +2126,14 @@ bool TargetLoweringBase::isJumpTableRelative() const {
21202126
return getTargetMachine().isPositionIndependent();
21212127
}
21222128

2129+
// Returns the current value of the file-level MinimumBitTestCmps option
// (the hidden -min-bit-test-cmps command-line flag).
unsigned TargetLoweringBase::getMinimumBitTestCmps() const {
2130+
return MinimumBitTestCmps;
2131+
}
2132+
2133+
// Overwrites the MinimumBitTestCmps option with Val.
// NOTE(review): MinimumBitTestCmps is a static cl::opt in this file rather
// than a member, so this write appears to be shared by every
// TargetLoweringBase instance in the process — confirm that is intended.
void TargetLoweringBase::setMinimumBitTestCmps(unsigned Val) {
2134+
MinimumBitTestCmps = Val;
2135+
}
2136+
21232137
Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const {
21242138
if (TM.Options.LoopAlignment)
21252139
return Align(TM.Options.LoopAlignment);

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,11 @@ static cl::opt<unsigned> PPCMinimumJumpTableEntries(
138138
"ppc-min-jump-table-entries", cl::init(64), cl::Hidden,
139139
cl::desc("Set minimum number of entries to use a jump table on PPC"));
140140

141+
static cl::opt<unsigned> PPCMinimumBitTestCmps(
142+
"ppc-min-bit-test-cmps", cl::init(3), cl::Hidden,
143+
cl::desc("Set minimum of largest number of comparisons to use bit test for "
144+
"switch on PPC."));
145+
141146
static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth(
142147
"ppc-gather-alias-max-depth", cl::init(18), cl::Hidden,
143148
cl::desc("max depth when checking alias info in GatherAllAliases()"));
@@ -1438,6 +1443,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
14381443
// Re-evaluate this value on future HWs that can do better with mtctr.
14391444
setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
14401445

1446+
// On PPC, the default minimum for the largest number of comparisons in a BitTest cluster is 3.
1447+
setMinimumBitTestCmps(PPCMinimumBitTestCmps);
1448+
14411449
setMinFunctionAlignment(Align(4));
14421450
setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32);
14431451

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -verify-machineinstrs < %s -O3 -mcpu=ppc -mtriple powerpc-ibm-aix \
3+
; RUN: -ppc-asm-full-reg-names | FileCheck %s
4+
5+
define i32 @foo(i32 noundef signext %x) {
6+
; CHECK-LABEL: foo:
7+
; CHECK: # %bb.0: # %entry
8+
; CHECK-NEXT: mflr r0
9+
; CHECK-NEXT: stwu r1, -64(r1)
10+
; CHECK-NEXT: stw r0, 72(r1)
11+
; CHECK-NEXT: cmpwi r3, 8
12+
; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill
13+
; CHECK-NEXT: mr r31, r3
14+
; CHECK-NEXT: li r3, 0
15+
; CHECK-NEXT: ble cr0, L..BB0_4
16+
; CHECK-NEXT: # %bb.1: # %entry
17+
; CHECK-NEXT: cmpwi r31, 11
18+
; CHECK-NEXT: bge cr0, L..BB0_7
19+
; CHECK-NEXT: # %bb.2: # %entry
20+
; CHECK-NEXT: cmplwi r31, 9
21+
; CHECK-NEXT: beq cr0, L..BB0_9
22+
; CHECK-NEXT: # %bb.3: # %entry
23+
; CHECK-NEXT: cmplwi r31, 10
24+
; CHECK-NEXT: beq cr0, L..BB0_11
25+
; CHECK-NEXT: b L..BB0_13
26+
; CHECK-NEXT: L..BB0_4: # %entry
27+
; CHECK-NEXT: cmplwi r31, 4
28+
; CHECK-NEXT: beq cr0, L..BB0_12
29+
; CHECK-NEXT: # %bb.5: # %entry
30+
; CHECK-NEXT: cmplwi r31, 7
31+
; CHECK-NEXT: beq cr0, L..BB0_11
32+
; CHECK-NEXT: # %bb.6: # %entry
33+
; CHECK-NEXT: cmplwi r31, 8
34+
; CHECK-NEXT: beq cr0, L..BB0_10
35+
; CHECK-NEXT: b L..BB0_13
36+
; CHECK-NEXT: L..BB0_7: # %entry
37+
; CHECK-NEXT: beq cr0, L..BB0_10
38+
; CHECK-NEXT: # %bb.8: # %entry
39+
; CHECK-NEXT: cmplwi r31, 12
40+
; CHECK-NEXT: bne cr0, L..BB0_13
41+
; CHECK-NEXT: L..BB0_9: # %sw.bb2
42+
; CHECK-NEXT: mr r3, r31
43+
; CHECK-NEXT: bl .foo3[PR]
44+
; CHECK-NEXT: nop
45+
; CHECK-NEXT: mr r3, r31
46+
; CHECK-NEXT: b L..BB0_13
47+
; CHECK-NEXT: L..BB0_10: # %sw.bb1
48+
; CHECK-NEXT: mr r3, r31
49+
; CHECK-NEXT: bl .foo2[PR]
50+
; CHECK-NEXT: nop
51+
; CHECK-NEXT: mr r3, r31
52+
; CHECK-NEXT: b L..BB0_13
53+
; CHECK-NEXT: L..BB0_11: # %sw.bb
54+
; CHECK-NEXT: mr r3, r31
55+
; CHECK-NEXT: bl .foo1[PR]
56+
; CHECK-NEXT: nop
57+
; CHECK-NEXT: mr r3, r31
58+
; CHECK-NEXT: b L..BB0_13
59+
; CHECK-NEXT: L..BB0_12: # %sw.bb3
60+
; CHECK-NEXT: li r3, 4
61+
; CHECK-NEXT: bl .foo4[PR]
62+
; CHECK-NEXT: nop
63+
; CHECK-NEXT: li r3, 4
64+
; CHECK-NEXT: L..BB0_13: # %return
65+
; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload
66+
; CHECK-NEXT: addi r1, r1, 64
67+
; CHECK-NEXT: lwz r0, 8(r1)
68+
; CHECK-NEXT: mtlr r0
69+
; CHECK-NEXT: blr
70+
entry:
71+
switch i32 %x, label %return [
72+
i32 7, label %sw.bb
73+
i32 10, label %sw.bb
74+
i32 8, label %sw.bb1
75+
i32 11, label %sw.bb1
76+
i32 9, label %sw.bb2
77+
i32 12, label %sw.bb2
78+
i32 4, label %sw.bb3
79+
]
80+
81+
sw.bb: ; preds = %entry, %entry
82+
tail call void @foo1(i32 noundef signext %x)
83+
br label %return
84+
85+
sw.bb1: ; preds = %entry, %entry
86+
tail call void @foo2(i32 noundef signext %x)
87+
br label %return
88+
89+
sw.bb2: ; preds = %entry, %entry
90+
tail call void @foo3(i32 noundef signext %x)
91+
br label %return
92+
93+
sw.bb3: ; preds = %entry
94+
tail call void @foo4(i32 noundef signext 4)
95+
br label %return
96+
97+
return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry
98+
%retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ %x, %sw.bb ]
99+
ret i32 %retval.0
100+
}
101+
102+
define i32 @goo(i32 noundef signext %x) {
103+
; CHECK-LABEL: goo:
104+
; CHECK: # %bb.0: # %entry
105+
; CHECK-NEXT: mflr r0
106+
; CHECK-NEXT: stwu r1, -64(r1)
107+
; CHECK-NEXT: stw r0, 72(r1)
108+
; CHECK-NEXT: cmplwi r3, 12
109+
; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill
110+
; CHECK-NEXT: mr r31, r3
111+
; CHECK-NEXT: bgt cr0, L..BB1_7
112+
; CHECK-NEXT: # %bb.1: # %entry
113+
; CHECK-NEXT: li r3, 1
114+
; CHECK-NEXT: slw r3, r3, r31
115+
; CHECK-NEXT: andi. r4, r3, 5632
116+
; CHECK-NEXT: bne cr0, L..BB1_4
117+
; CHECK-NEXT: # %bb.2: # %entry
118+
; CHECK-NEXT: andi. r3, r3, 2304
119+
; CHECK-NEXT: beq cr0, L..BB1_5
120+
; CHECK-NEXT: # %bb.3: # %sw.bb1
121+
; CHECK-NEXT: mr r3, r31
122+
; CHECK-NEXT: bl .foo2[PR]
123+
; CHECK-NEXT: nop
124+
; CHECK-NEXT: b L..BB1_9
125+
; CHECK-NEXT: L..BB1_4: # %sw.bb2
126+
; CHECK-NEXT: mr r3, r31
127+
; CHECK-NEXT: bl .foo3[PR]
128+
; CHECK-NEXT: nop
129+
; CHECK-NEXT: b L..BB1_9
130+
; CHECK-NEXT: L..BB1_5: # %entry
131+
; CHECK-NEXT: cmplwi r31, 7
132+
; CHECK-NEXT: bne cr0, L..BB1_7
133+
; CHECK-NEXT: # %bb.6: # %sw.bb
134+
; CHECK-NEXT: li r3, 7
135+
; CHECK-NEXT: li r31, 7
136+
; CHECK-NEXT: bl .foo1[PR]
137+
; CHECK-NEXT: nop
138+
; CHECK-NEXT: b L..BB1_9
139+
; CHECK-NEXT: L..BB1_7: # %entry
140+
; CHECK-NEXT: cmplwi r31, 4
141+
; CHECK-NEXT: li r31, 0
142+
; CHECK-NEXT: bne cr0, L..BB1_9
143+
; CHECK-NEXT: # %bb.8: # %sw.bb3
144+
; CHECK-NEXT: li r3, 4
145+
; CHECK-NEXT: li r31, 4
146+
; CHECK-NEXT: bl .foo4[PR]
147+
; CHECK-NEXT: nop
148+
; CHECK-NEXT: L..BB1_9: # %return
149+
; CHECK-NEXT: mr r3, r31
150+
; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload
151+
; CHECK-NEXT: addi r1, r1, 64
152+
; CHECK-NEXT: lwz r0, 8(r1)
153+
; CHECK-NEXT: mtlr r0
154+
; CHECK-NEXT: blr
155+
entry:
156+
switch i32 %x, label %return [
157+
i32 7, label %sw.bb
158+
i32 8, label %sw.bb1
159+
i32 11, label %sw.bb1
160+
i32 9, label %sw.bb2
161+
i32 10, label %sw.bb2
162+
i32 12, label %sw.bb2
163+
i32 4, label %sw.bb3
164+
]
165+
166+
sw.bb: ; preds = %entry
167+
tail call void @foo1(i32 noundef signext 7)
168+
br label %return
169+
170+
sw.bb1: ; preds = %entry, %entry
171+
tail call void @foo2(i32 noundef signext %x)
172+
br label %return
173+
174+
sw.bb2: ; preds = %entry, %entry, %entry
175+
tail call void @foo3(i32 noundef signext %x)
176+
br label %return
177+
178+
sw.bb3: ; preds = %entry
179+
tail call void @foo4(i32 noundef signext 4)
180+
br label %return
181+
182+
return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry
183+
%retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ 7, %sw.bb ]
184+
ret i32 %retval.0
185+
}
186+
187+
declare void @foo1(i32 noundef signext)
188+
189+
declare void @foo2(i32 noundef signext)
190+
191+
declare void @foo3(i32 noundef signext)
192+
193+
declare void @foo4(i32 noundef signext)

0 commit comments

Comments
 (0)