Skip to content

Commit 4f0be94

Browse files
authored
[LV] Improve code in selectInterleaveCount (NFC) (#128002)
Use the fact that getSmallBestKnownTC returns the exact trip count when it is known, and otherwise falls back to an estimate, to factor out the code shared by the two trip-count cases in selectInterleaveCount.
1 parent 89f8267 commit 4f0be94

File tree

1 file changed

+40
-39
lines changed

1 file changed

+40
-39
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 40 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -4985,7 +4985,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
49854985
if (Legal->hasUncountableEarlyExit())
49864986
return 1;
49874987

4988-
auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
49894988
const bool HasReductions = !Legal->getReductionVars().empty();
49904989

49914990
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -5062,51 +5061,53 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
50625061
}
50635062

50645063
unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
5065-
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5066-
if (KnownTC > 0) {
5067-
// At least one iteration must be scalar when this constraint holds. So the
5068-
// maximum available iterations for interleaving is one less.
5069-
unsigned AvailableTC =
5070-
requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5071-
5072-
// If trip count is known we select between two prospective ICs, where
5073-
// 1) the aggressive IC is capped by the trip count divided by VF
5074-
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
5075-
// The final IC is selected in a way that the epilogue loop trip count is
5076-
// minimized while maximizing the IC itself, so that we either run the
5077-
// vector loop at least once if it generates a small epilogue loop, or else
5078-
// we run the vector loop at least twice.
5079-
5080-
unsigned InterleaveCountUB = bit_floor(
5081-
std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5082-
unsigned InterleaveCountLB = bit_floor(std::max(
5083-
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5084-
MaxInterleaveCount = InterleaveCountLB;
5085-
5086-
if (InterleaveCountUB != InterleaveCountLB) {
5087-
unsigned TailTripCountUB =
5088-
(AvailableTC % (EstimatedVF * InterleaveCountUB));
5089-
unsigned TailTripCountLB =
5090-
(AvailableTC % (EstimatedVF * InterleaveCountLB));
5091-
// If both produce same scalar tail, maximize the IC to do the same work
5092-
// in fewer vector loop iterations
5093-
if (TailTripCountUB == TailTripCountLB)
5094-
MaxInterleaveCount = InterleaveCountUB;
5095-
}
5096-
} else if (BestKnownTC) {
5064+
5065+
// Try to get the exact trip count, or an estimate based on profiling data or
5066+
// ConstantMax from PSE, failing that.
5067+
if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) {
50975068
// At least one iteration must be scalar when this constraint holds. So the
50985069
// maximum available iterations for interleaving is one less.
50995070
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
51005071
? (*BestKnownTC) - 1
51015072
: *BestKnownTC;
51025073

5103-
// If trip count is an estimated compile time constant, limit the
5104-
// IC to be capped by the trip count divided by VF * 2, such that the vector
5105-
// loop runs at least twice to make interleaving seem profitable when there
5106-
// is an epilogue loop present. Since exact Trip count is not known we
5107-
// choose to be conservative in our IC estimate.
5108-
MaxInterleaveCount = bit_floor(std::max(
5074+
unsigned InterleaveCountLB = bit_floor(std::max(
51095075
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5076+
5077+
if (PSE.getSE()->getSmallConstantTripCount(TheLoop) > 0) {
5078+
// If the best known trip count is exact, we select between two
5079+
// prospective ICs, where
5080+
//
5081+
// 1) the aggressive IC is capped by the trip count divided by VF
5082+
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
5083+
//
5084+
// The final IC is selected in a way that the epilogue loop trip count is
5085+
// minimized while maximizing the IC itself, so that we either run the
5086+
// vector loop at least once if it generates a small epilogue loop, or
5087+
// else we run the vector loop at least twice.
5088+
5089+
unsigned InterleaveCountUB = bit_floor(std::max(
5090+
1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5091+
MaxInterleaveCount = InterleaveCountLB;
5092+
5093+
if (InterleaveCountUB != InterleaveCountLB) {
5094+
unsigned TailTripCountUB =
5095+
(AvailableTC % (EstimatedVF * InterleaveCountUB));
5096+
unsigned TailTripCountLB =
5097+
(AvailableTC % (EstimatedVF * InterleaveCountLB));
5098+
// If both produce same scalar tail, maximize the IC to do the same work
5099+
// in fewer vector loop iterations
5100+
if (TailTripCountUB == TailTripCountLB)
5101+
MaxInterleaveCount = InterleaveCountUB;
5102+
}
5103+
} else {
5104+
// If trip count is an estimated compile time constant, limit the
5105+
// IC to be capped by the trip count divided by VF * 2, such that the
5106+
// vector loop runs at least twice to make interleaving seem profitable
5107+
// when there is an epilogue loop present. Since exact Trip count is not
5108+
// known we choose to be conservative in our IC estimate.
5109+
MaxInterleaveCount = InterleaveCountLB;
5110+
}
51105111
}
51115112

51125113
assert(MaxInterleaveCount > 0 &&

0 commit comments

Comments
 (0)