Skip to content

Commit 094f9f5

Browse files
preames authored and akiramenai committed
[LSR][TTI][RISCV] Enable terminator folding for RISC-V
If looking for a miscompile revert candidate, look here! The transform being enabled prefers comparing to a loop-invariant exit value for a secondary IV over using an otherwise dead primary IV. This increases register pressure (by requiring the exit value to be live through the loop), but reduces the number of instructions within the loop by one. On RISC-V, which has a large number of scalar registers, this is generally a profitable transform. We lose the ability to use a beqz on what is typically a count-down IV, and pay the cost of computing the exit value on the secondary IV in the loop preheader, but save an add or sub in the loop body. For anything except an extremely short-running loop, or one with extreme register pressure, this is profitable. On spec2017, we see a 0.42% geomean improvement in dynamic icount, with no individual workload regressing by more than 0.25%. Code-size-wise, we trade a (possibly compressible) beqz and a (possibly compressible) addi for an uncompressible beq. We also add instructions in the preheader. Net result is a slight regression overall, but neutral or better inside the loop. Previous versions of this transform had numerous corner-case correctness bugs. All of the ones I can spot by inspection have been fixed, and I have run this through all of spec2017, but there may be further issues lurking. Adding uses to an IV is a fraught thing to do given poison semantics, so this transform is somewhat inherently risky. This patch is a reworked version of D134893 by @eop. That patch has been abandoned since May, so I picked it up, reworked it a bit, and am landing it.
1 parent c06812c commit 094f9f5

File tree

14 files changed

+1888
-763
lines changed

14 files changed

+1888
-763
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -715,6 +715,11 @@ class TargetTransformInfo {
715715
/// cost should return false, otherwise return true.
716716
bool isNumRegsMajorCostOfLSR() const;
717717

718+
/// Return true if LSR should attempt to replace a use of an otherwise dead
719+
/// primary IV in the latch condition with another IV available in the loop.
720+
/// When successful, makes the primary IV dead.
721+
bool shouldFoldTerminatingConditionAfterLSR() const;
722+
718723
/// \returns true if LSR should not optimize a chain that includes \p I.
719724
bool isProfitableLSRChainElement(Instruction *I) const;
720725

@@ -1760,6 +1765,7 @@ class TargetTransformInfo::Concept {
17601765
virtual bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
17611766
const TargetTransformInfo::LSRCost &C2) = 0;
17621767
virtual bool isNumRegsMajorCostOfLSR() = 0;
1768+
virtual bool shouldFoldTerminatingConditionAfterLSR() const = 0;
17631769
virtual bool isProfitableLSRChainElement(Instruction *I) = 0;
17641770
virtual bool canMacroFuseCmp() = 0;
17651771
virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
@@ -2205,6 +2211,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
22052211
bool isNumRegsMajorCostOfLSR() override {
22062212
return Impl.isNumRegsMajorCostOfLSR();
22072213
}
2214+
bool shouldFoldTerminatingConditionAfterLSR() const override {
2215+
return Impl.shouldFoldTerminatingConditionAfterLSR();
2216+
}
22082217
bool isProfitableLSRChainElement(Instruction *I) override {
22092218
return Impl.isProfitableLSRChainElement(I);
22102219
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,8 @@ class TargetTransformInfoImplBase {
231231

232232
bool isNumRegsMajorCostOfLSR() const { return true; }
233233

234+
bool shouldFoldTerminatingConditionAfterLSR() const { return false; }
235+
234236
bool isProfitableLSRChainElement(Instruction *I) const { return false; }
235237

236238
bool canMacroFuseCmp() const { return false; }

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
382382
return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
383383
}
384384

385+
bool shouldFoldTerminatingConditionAfterLSR() const {
386+
return TargetTransformInfoImplBase::
387+
shouldFoldTerminatingConditionAfterLSR();
388+
}
389+
385390
bool isProfitableLSRChainElement(Instruction *I) {
386391
return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
387392
}

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,10 @@ bool TargetTransformInfo::isNumRegsMajorCostOfLSR() const {
402402
return TTIImpl->isNumRegsMajorCostOfLSR();
403403
}
404404

405+
bool TargetTransformInfo::shouldFoldTerminatingConditionAfterLSR() const {
406+
return TTIImpl->shouldFoldTerminatingConditionAfterLSR();
407+
}
408+
405409
bool TargetTransformInfo::isProfitableLSRChainElement(Instruction *I) const {
406410
return TTIImpl->isProfitableLSRChainElement(I);
407411
}

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
359359

360360
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
361361
const TargetTransformInfo::LSRCost &C2);
362+
363+
bool shouldFoldTerminatingConditionAfterLSR() const {
364+
return true;
365+
}
362366
};
363367

364368
} // end namespace llvm

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,8 @@ static cl::opt<unsigned> SetupCostDepthLimit(
188188
"lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
189189
cl::desc("The limit on recursion depth for LSRs setup cost"));
190190

191-
static cl::opt<bool> AllowTerminatingConditionFoldingAfterLSR(
192-
"lsr-term-fold", cl::Hidden, cl::init(false),
191+
static cl::opt<cl::boolOrDefault> AllowTerminatingConditionFoldingAfterLSR(
192+
"lsr-term-fold", cl::Hidden,
193193
cl::desc("Attempt to replace primary IV with other IV."));
194194

195195
static cl::opt<bool> AllowDropSolutionIfLessProfitable(
@@ -6949,7 +6949,18 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
69496949
}
69506950
}
69516951

6952-
if (AllowTerminatingConditionFoldingAfterLSR) {
6952+
const bool EnableFormTerm = [&] {
6953+
switch (AllowTerminatingConditionFoldingAfterLSR) {
6954+
case cl::BOU_TRUE:
6955+
return true;
6956+
case cl::BOU_FALSE:
6957+
return false;
6958+
case cl::BOU_UNSET:
6959+
return TTI.shouldFoldTerminatingConditionAfterLSR();
6960+
}
6961+
}();
6962+
6963+
if (EnableFormTerm) {
69536964
if (auto Opt = canFoldTermCondOfLoop(L, SE, DT, LI)) {
69546965
auto [ToFold, ToHelpFold, TermValueS, MustDrop] = *Opt;
69556966

llvm/test/CodeGen/RISCV/branch-on-zero.ll

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -120,36 +120,45 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
120120
; RV32-LABEL: test_lshr2:
121121
; RV32: # %bb.0: # %entry
122122
; RV32-NEXT: srli a2, a2, 2
123-
; RV32-NEXT: beqz a2, .LBB3_2
124-
; RV32-NEXT: .LBB3_1: # %while.body
123+
; RV32-NEXT: beqz a2, .LBB3_3
124+
; RV32-NEXT: # %bb.1: # %while.body.preheader
125+
; RV32-NEXT: slli a2, a2, 2
126+
; RV32-NEXT: add a2, a1, a2
127+
; RV32-NEXT: .LBB3_2: # %while.body
125128
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
126129
; RV32-NEXT: lw a3, 0(a1)
127-
; RV32-NEXT: addi a1, a1, 4
130+
; RV32-NEXT: addi a4, a1, 4
128131
; RV32-NEXT: slli a3, a3, 1
129-
; RV32-NEXT: addi a4, a0, 4
130-
; RV32-NEXT: addi a2, a2, -1
132+
; RV32-NEXT: addi a1, a0, 4
131133
; RV32-NEXT: sw a3, 0(a0)
132-
; RV32-NEXT: mv a0, a4
133-
; RV32-NEXT: bnez a2, .LBB3_1
134-
; RV32-NEXT: .LBB3_2: # %while.end
134+
; RV32-NEXT: mv a0, a1
135+
; RV32-NEXT: mv a1, a4
136+
; RV32-NEXT: bne a4, a2, .LBB3_2
137+
; RV32-NEXT: .LBB3_3: # %while.end
135138
; RV32-NEXT: li a0, 0
136139
; RV32-NEXT: ret
137140
;
138141
; RV64-LABEL: test_lshr2:
139142
; RV64: # %bb.0: # %entry
140143
; RV64-NEXT: srliw a2, a2, 2
141-
; RV64-NEXT: beqz a2, .LBB3_2
142-
; RV64-NEXT: .LBB3_1: # %while.body
144+
; RV64-NEXT: beqz a2, .LBB3_3
145+
; RV64-NEXT: # %bb.1: # %while.body.preheader
146+
; RV64-NEXT: addi a2, a2, -1
147+
; RV64-NEXT: slli a2, a2, 32
148+
; RV64-NEXT: srli a2, a2, 30
149+
; RV64-NEXT: add a2, a2, a1
150+
; RV64-NEXT: addi a2, a2, 4
151+
; RV64-NEXT: .LBB3_2: # %while.body
143152
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
144153
; RV64-NEXT: lw a3, 0(a1)
145-
; RV64-NEXT: addi a1, a1, 4
154+
; RV64-NEXT: addi a4, a1, 4
146155
; RV64-NEXT: slli a3, a3, 1
147-
; RV64-NEXT: addi a4, a0, 4
148-
; RV64-NEXT: addiw a2, a2, -1
156+
; RV64-NEXT: addi a1, a0, 4
149157
; RV64-NEXT: sw a3, 0(a0)
150-
; RV64-NEXT: mv a0, a4
151-
; RV64-NEXT: bnez a2, .LBB3_1
152-
; RV64-NEXT: .LBB3_2: # %while.end
158+
; RV64-NEXT: mv a0, a1
159+
; RV64-NEXT: mv a1, a4
160+
; RV64-NEXT: bne a4, a2, .LBB3_2
161+
; RV64-NEXT: .LBB3_3: # %while.end
153162
; RV64-NEXT: li a0, 0
154163
; RV64-NEXT: ret
155164
entry:

llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,19 @@
88
define void @test1(ptr nocapture noundef %a, i32 noundef signext %n) {
99
; CHECK-LABEL: test1:
1010
; CHECK: # %bb.0: # %entry
11-
; CHECK-NEXT: blez a1, .LBB0_2
12-
; CHECK-NEXT: .LBB0_1: # %for.body
11+
; CHECK-NEXT: blez a1, .LBB0_3
12+
; CHECK-NEXT: # %bb.1: # %for.body.preheader
13+
; CHECK-NEXT: slli a1, a1, 32
14+
; CHECK-NEXT: srli a1, a1, 30
15+
; CHECK-NEXT: add a1, a0, a1
16+
; CHECK-NEXT: .LBB0_2: # %for.body
1317
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1418
; CHECK-NEXT: lw a2, 0(a0)
1519
; CHECK-NEXT: addiw a2, a2, 4
1620
; CHECK-NEXT: sw a2, 0(a0)
17-
; CHECK-NEXT: addi a1, a1, -1
1821
; CHECK-NEXT: addi a0, a0, 4
19-
; CHECK-NEXT: bnez a1, .LBB0_1
20-
; CHECK-NEXT: .LBB0_2: # %for.cond.cleanup
22+
; CHECK-NEXT: bne a0, a1, .LBB0_2
23+
; CHECK-NEXT: .LBB0_3: # %for.cond.cleanup
2124
; CHECK-NEXT: ret
2225
entry:
2326
%cmp3 = icmp sgt i32 %n, 0

0 commit comments

Comments
 (0)