Skip to content

Commit f8c2c4f

Browse files
[LSR] Account for hardware loop instructions (#147958)
A hardware loop instruction combines a subtract, a compare with zero, and a branch. We currently account for the compare and branch being combined into one in Cost::RateFormula, as part of the more general handling for compare-branch-zero, but we don't account for the subtract, which leads to suboptimal decisions in some cases. Fix this in Cost::RateRegister by detecting when the addrec is such a subtract and not adding its cost to AddRecCost in that case.
1 parent cfcda5d commit f8c2c4f

File tree

4 files changed

+378
-42
lines changed

4 files changed

+378
-42
lines changed

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

Lines changed: 50 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,8 @@ struct Formula {
521521

522522
bool hasZeroEnd() const;
523523

524+
bool countsDownToZero() const;
525+
524526
size_t getNumRegs() const;
525527
Type *getType() const;
526528

@@ -705,6 +707,16 @@ bool Formula::hasZeroEnd() const {
705707
return true;
706708
}
707709

710+
bool Formula::countsDownToZero() const {
711+
if (!hasZeroEnd())
712+
return false;
713+
assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
714+
const APInt *StepInt;
715+
if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
716+
return false;
717+
return StepInt->isNegative();
718+
}
719+
708720
/// Return the total number of register operands used by this formula. This does
709721
/// not include register uses implied by non-constant addrec strides.
710722
size_t Formula::getNumRegs() const {
@@ -1227,20 +1239,21 @@ class Cost {
12271239
return C.NumRegs == ~0u;
12281240
}
12291241

1230-
void RateFormula(const Formula &F,
1231-
SmallPtrSetImpl<const SCEV *> &Regs,
1232-
const DenseSet<const SCEV *> &VisitedRegs,
1233-
const LSRUse &LU,
1242+
void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1243+
const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
1244+
bool HardwareLoopProfitable,
12341245
SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
12351246

12361247
void print(raw_ostream &OS) const;
12371248
void dump() const;
12381249

12391250
private:
12401251
void RateRegister(const Formula &F, const SCEV *Reg,
1241-
SmallPtrSetImpl<const SCEV *> &Regs);
1252+
SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1253+
bool HardwareLoopProfitable);
12421254
void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
12431255
SmallPtrSetImpl<const SCEV *> &Regs,
1256+
const LSRUse &LU, bool HardwareLoopProfitable,
12441257
SmallPtrSetImpl<const SCEV *> *LoserRegs);
12451258
};
12461259

@@ -1383,7 +1396,8 @@ static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
13831396

13841397
/// Tally up interesting quantities from the given register.
13851398
void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1386-
SmallPtrSetImpl<const SCEV *> &Regs) {
1399+
SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1400+
bool HardwareLoopProfitable) {
13871401
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
13881402
// If this is an addrec for another loop, it should be an invariant
13891403
// with respect to L since L is the innermost loop (at least
@@ -1419,13 +1433,18 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
14191433
SE->isLoopInvariant(Start, L)))
14201434
LoopCost = 0;
14211435
}
1436+
// If the loop counts down to zero and we'll be using a hardware loop then
1437+
// the addrec will be combined into the hardware loop instruction.
1438+
if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
1439+
HardwareLoopProfitable)
1440+
LoopCost = 0;
14221441
C.AddRecCost += LoopCost;
14231442

14241443
// Add the step value register, if it needs one.
14251444
// TODO: The non-affine case isn't precisely modeled here.
14261445
if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
14271446
if (!Regs.count(AR->getOperand(1))) {
1428-
RateRegister(F, AR->getOperand(1), Regs);
1447+
RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
14291448
if (isLoser())
14301449
return;
14311450
}
@@ -1448,22 +1467,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
14481467
/// one of those regs an instant loser.
14491468
void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
14501469
SmallPtrSetImpl<const SCEV *> &Regs,
1470+
const LSRUse &LU, bool HardwareLoopProfitable,
14511471
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
14521472
if (LoserRegs && LoserRegs->count(Reg)) {
14531473
Lose();
14541474
return;
14551475
}
14561476
if (Regs.insert(Reg).second) {
1457-
RateRegister(F, Reg, Regs);
1477+
RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
14581478
if (LoserRegs && isLoser())
14591479
LoserRegs->insert(Reg);
14601480
}
14611481
}
14621482

1463-
void Cost::RateFormula(const Formula &F,
1464-
SmallPtrSetImpl<const SCEV *> &Regs,
1483+
void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
14651484
const DenseSet<const SCEV *> &VisitedRegs,
1466-
const LSRUse &LU,
1485+
const LSRUse &LU, bool HardwareLoopProfitable,
14671486
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
14681487
if (isLoser())
14691488
return;
@@ -1477,7 +1496,8 @@ void Cost::RateFormula(const Formula &F,
14771496
Lose();
14781497
return;
14791498
}
1480-
RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
1499+
RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
1500+
LoserRegs);
14811501
if (isLoser())
14821502
return;
14831503
}
@@ -1486,7 +1506,8 @@ void Cost::RateFormula(const Formula &F,
14861506
Lose();
14871507
return;
14881508
}
1489-
RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
1509+
RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
1510+
LoserRegs);
14901511
if (isLoser())
14911512
return;
14921513
}
@@ -2112,6 +2133,7 @@ class LSRInstance {
21122133
TTI::AddressingModeKind AMK;
21132134
mutable SCEVExpander Rewriter;
21142135
bool Changed = false;
2136+
bool HardwareLoopProfitable = false;
21152137

21162138
/// This is the insert position that the current loop's induction variable
21172139
/// increment should be placed. In simple loops, this is the latch block's
@@ -3592,7 +3614,8 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
35923614
if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
35933615
Formula F;
35943616
F.initialMatch(S, L, SE);
3595-
BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
3617+
BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3618+
HardwareLoopProfitable);
35963619
VisitedLSRUse.insert(LUIdx);
35973620
}
35983621

@@ -4730,7 +4753,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
47304753
// the corresponding bad register from the Regs set.
47314754
Cost CostF(L, SE, TTI, AMK);
47324755
Regs.clear();
4733-
CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
4756+
CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4757+
&LoserRegs);
47344758
if (CostF.isLoser()) {
47354759
// During initial formula generation, undesirable formulae are generated
47364760
// by uses within other loops that have some non-trivial address mode or
@@ -4763,7 +4787,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
47634787

47644788
Cost CostBest(L, SE, TTI, AMK);
47654789
Regs.clear();
4766-
CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
4790+
CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4791+
HardwareLoopProfitable);
47674792
if (CostF.isLess(CostBest))
47684793
std::swap(F, Best);
47694794
LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
@@ -5021,9 +5046,9 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
50215046
Cost CostFA(L, SE, TTI, AMK);
50225047
Cost CostFB(L, SE, TTI, AMK);
50235048
Regs.clear();
5024-
CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
5049+
CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
50255050
Regs.clear();
5026-
CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
5051+
CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
50275052
return CostFA.isLess(CostFB);
50285053
};
50295054

@@ -5428,7 +5453,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
54285453
// the current best, prune the search at that point.
54295454
NewCost = CurCost;
54305455
NewRegs = CurRegs;
5431-
NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
5456+
NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
54325457
if (NewCost.isLess(SolutionCost)) {
54335458
Workspace.push_back(&F);
54345459
if (Workspace.size() != Uses.size()) {
@@ -6133,6 +6158,12 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
61336158
L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
61346159
dbgs() << ":\n");
61356160

6161+
// Check if we expect this loop to use a hardware loop instruction, which will
6162+
// be used when calculating the costs of formulas.
6163+
HardwareLoopInfo HWLoopInfo(L);
6164+
HardwareLoopProfitable =
6165+
TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
6166+
61366167
// Configure SCEVExpander already now, so the correct mode is used for
61376168
// isSafeToExpand() checks.
61386169
#if LLVM_ENABLE_ABI_BREAKING_CHECKS

0 commit comments

Comments
 (0)