Skip to content

Commit 151b2b4

Browse files
committed
[CGP] Split off switch cases with 0 and another power-of-2 const.
Add a new transform to split off switch cases with 0 and another power-of-2 constant to an AND + ICMP + BR. This removes a branch which can be highly profitable, especially when the switch controls exiting the loop. Alive2 proof showing that a power-of-2 constant is required: https://alive2.llvm.org/ce/z/VIMMNB.
1 parent 9a4a6d2 commit 151b2b4

File tree

3 files changed

+121
-49
lines changed

3 files changed

+121
-49
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 75 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,8 @@ class CodeGenPrepare {
441441
bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
442442
bool optimizeSwitchType(SwitchInst *SI);
443443
bool optimizeSwitchPhiConstants(SwitchInst *SI);
444-
bool optimizeSwitchInst(SwitchInst *SI);
444+
bool optimizeSwitchPow2Constant(SwitchInst *SI, ModifyDT &ModifiedDT);
445+
bool optimizeSwitchInst(SwitchInst *SI, ModifyDT &ModifiedDT);
445446
bool optimizeExtractElementInst(Instruction *Inst);
446447
bool dupRetToEnableTailCallOpts(BasicBlock *BB, ModifyDT &ModifiedDT);
447448
bool fixupDbgValue(Instruction *I);
@@ -7888,9 +7889,80 @@ bool CodeGenPrepare::optimizeSwitchPhiConstants(SwitchInst *SI) {
78887889
return Changed;
78897890
}
78907891

7891-
bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
7892+
/// Try to split the two switch cases `0` and `2^k` off \p SI when they share a
/// successor, replacing them with a single mask-compare-and-branch:
///
///   %and = and %cond, ~2^k
///   %cmp = icmp eq %and, 0
///   br %cmp, label %Dst, label %RemainingSwitch
///
/// `(%cond & ~2^k) == 0` holds exactly when %cond is 0 or 2^k, so the two
/// cases are covered by one branch instead of two.
///
/// \param SI          The switch to transform. On success it is moved into a
///                    new block and has the two cases removed.
/// \param ModifiedDT  Set to ModifyDT::ModifyBBDT when the CFG was changed.
/// \returns true if the switch was transformed, false if nothing changed.
bool CodeGenPrepare::optimizeSwitchPow2Constant(SwitchInst *SI,
                                                ModifyDT &ModifiedDT) {
  // Bail out if there either aren't enough cases to fold or too many.
  const unsigned NumCases = SI->getNumCases();
  if (NumCases < 2 || NumCases >= 8)
    return false;

  // Collect cases and sort them so that power-of-2s come first in ascending
  // order, followed by all remaining values in ascending (unsigned) order.
  SmallVector<std::pair<APInt, BasicBlock *>> Cases;
  for (auto &C : SI->cases())
    Cases.emplace_back(C.getCaseValue()->getValue(), C.getCaseSuccessor());
  sort(Cases, [](const auto &A, const auto &B) {
    const bool APow2 = A.first.isPowerOf2();
    const bool BPow2 = B.first.isPowerOf2();
    if (APow2 != BPow2)
      return APow2;
    return A.first.ult(B.first);
  });

  // Bail out if we don't have a single power-of-2 constant, followed by zero
  // with a common destination. With that ordering, slot 0 must hold the only
  // power-of-2 and slot 1 must hold zero.
  // TODO: could support multiple power-of-2s by just picking one.
  BasicBlock *Dst = Cases[0].second;
  const APInt Pow2 = Cases[0].first;
  if (!Pow2.isPowerOf2() || !Cases[1].first.isZero() ||
      Cases[1].second != Dst)
    return false;

  // Limit the transform to switches leaving loops for now: require that the
  // innermost loop of the destination differs from that of the switch block.
  if (LI->getLoopFor(Dst) == LI->getLoopFor(SI->getParent()))
    return false;

  // Check if there are case values before/after the power-of-2 that are
  // consecutive and go to the same destination. In that case, they can be
  // generated as range-checks, so leave the switch alone.
  sort(Cases,
       [](const auto &A, const auto &B) { return A.first.ult(B.first); });
  auto Idx =
      find_if(Cases, [&Pow2](const auto &C) { return C.first == Pow2; });
  const bool Increasing = Idx + 1 != Cases.end() &&
                          (Idx + 1)->second == Dst &&
                          Idx->first + 1 == (Idx + 1)->first;
  const bool Decreasing = Idx != Cases.begin() && (Idx - 1)->second == Dst &&
                          Idx->first - 1 == (Idx - 1)->first;
  if (Increasing || Decreasing)
    return false;

  // Move the switch into its own block and replace the fall-through branch
  // that splitBasicBlock created with the mask-and-compare check.
  // NOTE(review): NewBB is not registered with LoopInfo here; confirm that
  // later LI queries tolerate a block LI does not know about.
  BasicBlock *OldBB = SI->getParent();
  BasicBlock *NewBB = OldBB->splitBasicBlock(SI->getIterator());
  OldBB->getTerminator()->eraseFromParent();

  IRBuilder<> B(OldBB);
  auto &Ctx = OldBB->getContext();
  ConstantInt *Zero = B.getIntN(Pow2.getBitWidth(), 0);
  ConstantInt *Pow2CI = ConstantInt::get(Ctx, Pow2);
  // Clearing the power-of-2 bit leaves zero only for the values 0 and Pow2.
  Value *Masked =
      B.CreateAnd(SI->getCondition(), ConstantInt::get(Ctx, ~Pow2));
  Value *IsZeroOrPow2 = B.CreateICmpEQ(Masked, Zero);
  B.CreateCondBr(IsZeroOrPow2, Dst, NewBB);

  // Drop the two cases now handled by the new branch. Go through the profile
  // wrapper so that any !prof branch_weights attached to the switch keep one
  // weight per remaining successor; removing cases on the SwitchInst directly
  // would leave stale weights behind.
  // TODO: derive branch weights for the new conditional branch as well.
  {
    SwitchInstProfUpdateWrapper SIW(*SI);
    SIW.removeCase(SIW->findCaseValue(Zero));
    SIW.removeCase(SIW->findCaseValue(Pow2CI));
  }

  // The two removed switch edges NewBB -> Dst are replaced by the single edge
  // OldBB -> Dst: add one incoming value for OldBB and drop two for NewBB.
  for (auto &P : Dst->phis()) {
    P.addIncoming(P.getIncomingValueForBlock(NewBB), OldBB);
    P.removeIncomingValue(NewBB);
    P.removeIncomingValue(NewBB);
  }

  ModifiedDT = ModifyDT::ModifyBBDT;
  return true;
}
7961+
7962+
/// Run all switch-specific CodeGenPrepare transforms on \p SI, in order:
/// condition type adjustment, phi constant replacement, and splitting off a
/// 0 / power-of-2 case pair. Every transform is attempted regardless of
/// whether an earlier one changed anything.
///
/// \param SI          The switch instruction to optimize.
/// \param ModifiedDT  Passed through to the power-of-2 split, which sets it
///                    when it changes the CFG.
/// \returns true if any of the transforms changed the IR.
bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI, ModifyDT &ModifiedDT) {
  // Evaluate each step into its own flag so none is short-circuited away.
  const bool TypeChanged = optimizeSwitchType(SI);
  const bool PhiConstantsChanged = optimizeSwitchPhiConstants(SI);
  const bool Pow2Split = optimizeSwitchPow2Constant(SI, ModifiedDT);
  return TypeChanged || PhiConstantsChanged || Pow2Split;
}
78967968

@@ -8815,7 +8887,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
88158887
case Instruction::ShuffleVector:
88168888
return optimizeShuffleVectorInst(cast<ShuffleVectorInst>(I));
88178889
case Instruction::Switch:
8818-
return optimizeSwitchInst(cast<SwitchInst>(I));
8890+
return optimizeSwitchInst(cast<SwitchInst>(I), ModifiedDT);
88198891
case Instruction::ExtractElement:
88208892
return optimizeExtractElementInst(cast<ExtractElementInst>(I));
88218893
case Instruction::Br:

llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,24 @@
44
define i32 @switch_with_matching_dests_0_and_pow2_3_cases(i8 %v) {
55
; CHECK-LABEL: switch_with_matching_dests_0_and_pow2_3_cases:
66
; CHECK: ; %bb.0: ; %entry
7-
; CHECK-NEXT: mov w9, #100 ; =0x64
8-
; CHECK-NEXT: mov w8, #20 ; =0x14
7+
; CHECK-NEXT: mov w8, #100 ; =0x64
8+
; CHECK-NEXT: mov w9, #223 ; =0xdf
99
; CHECK-NEXT: LBB0_1: ; %loop.header
1010
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
11-
; CHECK-NEXT: ands w10, w0, #0xff
12-
; CHECK-NEXT: b.eq LBB0_6
13-
; CHECK-NEXT: ; %bb.2: ; %loop.header
14-
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
15-
; CHECK-NEXT: cmp w10, #32
16-
; CHECK-NEXT: b.eq LBB0_6
17-
; CHECK-NEXT: ; %bb.3: ; %loop.header
18-
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
11+
; CHECK-NEXT: tst w0, w9
12+
; CHECK-NEXT: b.eq LBB0_4
13+
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
14+
; CHECK-NEXT: and w10, w0, #0xff
1915
; CHECK-NEXT: cmp w10, #124
20-
; CHECK-NEXT: b.eq LBB0_7
21-
; CHECK-NEXT: ; %bb.4: ; %loop.latch
16+
; CHECK-NEXT: b.eq LBB0_5
17+
; CHECK-NEXT: ; %bb.3: ; %loop.latch
2218
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
23-
; CHECK-NEXT: subs w9, w9, #1
19+
; CHECK-NEXT: subs w8, w8, #1
2420
; CHECK-NEXT: b.ne LBB0_1
25-
; CHECK-NEXT: ; %bb.5:
26-
; CHECK-NEXT: mov w8, #20 ; =0x14
27-
; CHECK-NEXT: LBB0_6: ; %common.ret
28-
; CHECK-NEXT: mov w0, w8
21+
; CHECK-NEXT: LBB0_4:
22+
; CHECK-NEXT: mov w0, #20 ; =0x14
2923
; CHECK-NEXT: ret
30-
; CHECK-NEXT: LBB0_7: ; %e2
24+
; CHECK-NEXT: LBB0_5: ; %e2
3125
; CHECK-NEXT: mov w0, #30 ; =0x1e
3226
; CHECK-NEXT: ret
3327
entry:

llvm/test/Transforms/CodeGenPrepare/AArch64/switch-cases-to-branch-and.ll

Lines changed: 34 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@ define i32 @switch_with_matching_dests_0_and_pow2_3_cases(i8 %v) {
99
; CHECK: [[LOOP_HEADER]]:
1010
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
1111
; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[V]] to i32
12-
; CHECK-NEXT: switch i32 [[TMP3]], label %[[LOOP_LATCH]] [
13-
; CHECK-NEXT: i32 32, label %[[E1:.*]]
14-
; CHECK-NEXT: i32 0, label %[[E1]]
15-
; CHECK-NEXT: i32 124, label %[[E2:.*]]
16-
; CHECK-NEXT: ]
12+
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP3]], -33
13+
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
14+
; CHECK-NEXT: br i1 [[TMP2]], label %[[E1:.*]], label %[[BB3:.*]]
15+
; CHECK: [[BB3]]:
16+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP3]], 124
17+
; CHECK-NEXT: br i1 [[COND]], label %[[E2:.*]], label %[[LOOP_LATCH]]
1718
; CHECK: [[LOOP_LATCH]]:
1819
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
1920
; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[IV_NEXT]], 100
@@ -54,11 +55,12 @@ define i32 @switch_with_matching_dests_0_and_pow2_3_cases_swapped(i8 %v) {
5455
; CHECK: [[LOOP_HEADER]]:
5556
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
5657
; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[V]] to i32
57-
; CHECK-NEXT: switch i32 [[TMP3]], label %[[LOOP_LATCH]] [
58-
; CHECK-NEXT: i32 0, label %[[E1:.*]]
59-
; CHECK-NEXT: i32 32, label %[[E1]]
60-
; CHECK-NEXT: i32 124, label %[[E2:.*]]
61-
; CHECK-NEXT: ]
58+
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP3]], -33
59+
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
60+
; CHECK-NEXT: br i1 [[TMP2]], label %[[E1:.*]], label %[[BB3:.*]]
61+
; CHECK: [[BB3]]:
62+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP3]], 124
63+
; CHECK-NEXT: br i1 [[COND]], label %[[E2:.*]], label %[[LOOP_LATCH]]
6264
; CHECK: [[LOOP_LATCH]]:
6365
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
6466
; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[IV_NEXT]], 100
@@ -106,19 +108,20 @@ define i32 @switch_with_matching_dests_0_and_pow2_3_cases_with_phi(i8 %v, i1 %c)
106108
; CHECK: [[LOOP_HEADER]]:
107109
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[THEN]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
108110
; CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[V]] to i32
109-
; CHECK-NEXT: switch i32 [[TMP0]], label %[[LOOP_LATCH]] [
110-
; CHECK-NEXT: i32 32, label %[[E1]]
111-
; CHECK-NEXT: i32 0, label %[[E1]]
112-
; CHECK-NEXT: i32 124, label %[[E2:.*]]
113-
; CHECK-NEXT: ]
111+
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], -33
112+
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
113+
; CHECK-NEXT: br i1 [[TMP2]], label %[[E1]], label %[[BB3:.*]]
114+
; CHECK: [[BB3]]:
115+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP0]], 124
116+
; CHECK-NEXT: br i1 [[COND]], label %[[E2:.*]], label %[[LOOP_LATCH]]
114117
; CHECK: [[LOOP_LATCH]]:
115118
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
116119
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 100
117120
; CHECK-NEXT: br i1 [[EC]], label %[[E0:.*]], label %[[LOOP_HEADER]]
118121
; CHECK: [[E0]]:
119122
; CHECK-NEXT: ret i32 10
120123
; CHECK: [[E1]]:
121-
; CHECK-NEXT: [[P:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 20, %[[LOOP_HEADER]] ], [ 20, %[[LOOP_HEADER]] ]
124+
; CHECK-NEXT: [[P:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 20, %[[LOOP_HEADER]] ]
122125
; CHECK-NEXT: ret i32 [[P]]
123126
; CHECK: [[E2]]:
124127
; CHECK-NEXT: ret i32 30
@@ -214,19 +217,20 @@ define i32 @switch_in_loop_with_matching_dests_0_and_pow2_3_cases(ptr %start) {
214217
; CHECK-NEXT: [[ENTRY:.*]]:
215218
; CHECK-NEXT: br label %[[LOOP:.*]]
216219
; CHECK: [[LOOP]]:
217-
; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[TMP0:%.*]], %[[LOOP]] ]
220+
; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[TMP0:%.*]], %[[TMP4:.*]] ]
218221
; CHECK-NEXT: [[TMP0]] = getelementptr inbounds nuw i8, ptr [[P]], i64 1
219222
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[TMP0]], align 1
220223
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[L]] to i32
221-
; CHECK-NEXT: switch i32 [[TMP1]], label %[[LOOP]] [
222-
; CHECK-NEXT: i32 32, label %[[E1:.*]]
223-
; CHECK-NEXT: i32 0, label %[[E1]]
224-
; CHECK-NEXT: i32 124, label %[[E2:.*]]
225-
; CHECK-NEXT: ]
224+
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP1]], -33
225+
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP5]], 0
226+
; CHECK-NEXT: br i1 [[TMP3]], label %[[E1:.*]], label %[[TMP4]]
227+
; CHECK: [[TMP4]]:
228+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP1]], 124
229+
; CHECK-NEXT: br i1 [[COND]], label %[[E2:.*]], label %[[LOOP]]
226230
; CHECK: [[E1]]:
227231
; CHECK-NEXT: br label %[[E2]]
228232
; CHECK: [[E2]]:
229-
; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ -1, %[[E1]] ], [ 0, %[[LOOP]] ]
233+
; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ -1, %[[E1]] ], [ 0, %[[TMP4]] ]
230234
; CHECK-NEXT: ret i32 [[TMP2]]
231235
;
232236
entry:
@@ -256,20 +260,22 @@ define i32 @switch_in_loop_with_matching_dests_0_and_pow2_4_cases(ptr %start) {
256260
; CHECK-NEXT: [[ENTRY:.*]]:
257261
; CHECK-NEXT: br label %[[LOOP:.*]]
258262
; CHECK: [[LOOP]]:
259-
; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[TMP0:%.*]], %[[LOOP]] ]
263+
; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[TMP0:%.*]], %[[TMP4:.*]] ]
260264
; CHECK-NEXT: [[TMP0]] = getelementptr inbounds nuw i8, ptr [[P]], i64 1
261265
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[TMP0]], align 1
262266
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[L]] to i32
267+
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP1]], -33
268+
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP5]], 0
269+
; CHECK-NEXT: br i1 [[TMP3]], label %[[E1:.*]], label %[[TMP4]]
270+
; CHECK: [[TMP4]]:
263271
; CHECK-NEXT: switch i32 [[TMP1]], label %[[LOOP]] [
264-
; CHECK-NEXT: i32 0, label %[[E1:.*]]
265-
; CHECK-NEXT: i32 15, label %[[E1]]
266-
; CHECK-NEXT: i32 32, label %[[E1]]
267272
; CHECK-NEXT: i32 124, label %[[E2:.*]]
273+
; CHECK-NEXT: i32 15, label %[[E1]]
268274
; CHECK-NEXT: ]
269275
; CHECK: [[E1]]:
270276
; CHECK-NEXT: br label %[[E2]]
271277
; CHECK: [[E2]]:
272-
; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ -1, %[[E1]] ], [ 0, %[[LOOP]] ]
278+
; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ -1, %[[E1]] ], [ 0, %[[TMP4]] ]
273279
; CHECK-NEXT: ret i32 [[TMP2]]
274280
;
275281
entry:

0 commit comments

Comments
 (0)