Skip to content

Commit e9df81f

Browse files
committed
Moves check to NoScalarEpilogueNeeded
1 parent 45541a4 commit e9df81f

File tree

2 files changed

+20
-20
lines changed

2 files changed

+20
-20
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1872,6 +1872,16 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
18721872
}
18731873

18741874
bool LoopVectorizationLegality::canFoldTailByMasking() const {
1875+
// The only loops we can vectorize without a scalar epilogue, are loops with
1876+
// a bottom-test and a single exiting block. We'd have to handle the fact
1877+
// that not every instruction executes on the last iteration. This will
1878+
// require a lane mask which varies through the vector loop body. (TODO)
1879+
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1880+
LLVM_DEBUG(
1881+
dbgs()
1882+
<< "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
1883+
return false;
1884+
}
18751885

18761886
LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
18771887

@@ -1924,16 +1934,6 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
19241934
}
19251935
}
19261936

1927-
// The only loops we can vectorize without a scalar epilogue, are loops with
1928-
// a bottom-test and a single exiting block. We'd have to handle the fact
1929-
// that not every instruction executes on the last iteration. This will
1930-
// require a lane mask which varies through the vector loop body. (TODO)
1931-
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1932-
LLVM_DEBUG(
1933-
dbgs()
1934-
<< "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
1935-
return false;
1936-
}
19371937
LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
19381938

19391939
return true;

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -4016,14 +4016,17 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40164016
}
40174017

40184018
auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
4019+
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
4020+
!Legal->hasUncountableEarlyExit())
4021+
return false;
40194022
unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
40204023
ScalarEvolution *SE = PSE.getSE();
4021-
// Currently only loops with countable exits are vectorized, but calling
4022-
// getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4023-
// uncountable exits whilst also ensuring the symbolic maximum and known
4024-
// back-edge taken count remain identical for loops with countable exits.
4024+
// Calling getSymbolicMaxBackedgeTakenCount enables support for loops
4025+
// with uncountable exits. For countable loops, the symbolic maximum must
4026+
// remain identical to the known back-edge taken count.
40254027
const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4026-
assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4028+
assert((Legal->hasUncountableEarlyExit() ||
4029+
BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
40274030
"Invalid loop count");
40284031
const SCEV *ExitCount = SE->getAddExpr(
40294032
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
@@ -4033,9 +4036,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40334036
return Rem->isZero();
40344037
};
40354038

4036-
bool HasSingleLatchExit =
4037-
TheLoop->getExitingBlock() == TheLoop->getLoopLatch();
4038-
if (HasSingleLatchExit && MaxPowerOf2RuntimeVF > 0u) {
4039+
if (MaxPowerOf2RuntimeVF > 0u) {
40394040
assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
40404041
"MaxFixedVF must be a power of 2");
40414042
if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
@@ -4046,8 +4047,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40464047
}
40474048

40484049
auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
4049-
if (HasSingleLatchExit && ExpectedTC &&
4050-
ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
4050+
if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
40514051
if (MaxPowerOf2RuntimeVF > 0u) {
40524052
// If we have a low-trip-count, and the fixed-width VF is known to divide
40534053
// the trip count but the scalable factor does not, use the fixed-width

0 commit comments

Comments
 (0)