@@ -4985,7 +4985,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4985
4985
if (Legal->hasUncountableEarlyExit ())
4986
4986
return 1 ;
4987
4987
4988
- auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop);
4989
4988
const bool HasReductions = !Legal->getReductionVars ().empty ();
4990
4989
4991
4990
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -5062,51 +5061,53 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
5062
5061
}
5063
5062
5064
5063
unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning);
5065
- unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
5066
- if (KnownTC > 0 ) {
5067
- // At least one iteration must be scalar when this constraint holds. So the
5068
- // maximum available iterations for interleaving is one less.
5069
- unsigned AvailableTC =
5070
- requiresScalarEpilogue (VF.isVector ()) ? KnownTC - 1 : KnownTC;
5071
-
5072
- // If trip count is known we select between two prospective ICs, where
5073
- // 1) the aggressive IC is capped by the trip count divided by VF
5074
- // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5075
- // The final IC is selected in a way that the epilogue loop trip count is
5076
- // minimized while maximizing the IC itself, so that we either run the
5077
- // vector loop at least once if it generates a small epilogue loop, or else
5078
- // we run the vector loop at least twice.
5079
-
5080
- unsigned InterleaveCountUB = bit_floor (
5081
- std::max (1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
5082
- unsigned InterleaveCountLB = bit_floor (std::max (
5083
- 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5084
- MaxInterleaveCount = InterleaveCountLB;
5085
-
5086
- if (InterleaveCountUB != InterleaveCountLB) {
5087
- unsigned TailTripCountUB =
5088
- (AvailableTC % (EstimatedVF * InterleaveCountUB));
5089
- unsigned TailTripCountLB =
5090
- (AvailableTC % (EstimatedVF * InterleaveCountLB));
5091
- // If both produce same scalar tail, maximize the IC to do the same work
5092
- // in fewer vector loop iterations
5093
- if (TailTripCountUB == TailTripCountLB)
5094
- MaxInterleaveCount = InterleaveCountUB;
5095
- }
5096
- } else if (BestKnownTC) {
5064
+
5065
+ // Try to get the exact trip count or, failing that, an estimate based on
5066
+ // profiling data or ConstantMax from PSE.
5067
+ if (auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop)) {
5097
5068
// At least one iteration must be scalar when this constraint holds. So the
5098
5069
// maximum available iterations for interleaving is one less.
5099
5070
unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
5100
5071
? (*BestKnownTC) - 1
5101
5072
: *BestKnownTC;
5102
5073
5103
- // If trip count is an estimated compile time constant, limit the
5104
- // IC to be capped by the trip count divided by VF * 2, such that the vector
5105
- // loop runs at least twice to make interleaving seem profitable when there
5106
- // is an epilogue loop present. Since exact Trip count is not known we
5107
- // choose to be conservative in our IC estimate.
5108
- MaxInterleaveCount = bit_floor (std::max (
5074
+ unsigned InterleaveCountLB = bit_floor (std::max (
5109
5075
1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5076
+
5077
+ if (PSE.getSE ()->getSmallConstantTripCount (TheLoop) > 0 ) {
5078
+ // If the best known trip count is exact, we select between two
5079
+ // prospective ICs, where
5080
+ //
5081
+ // 1) the aggressive IC is capped by the trip count divided by VF
5082
+ // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5083
+ //
5084
+ // The final IC is selected in a way that the epilogue loop trip count is
5085
+ // minimized while maximizing the IC itself, so that we either run the
5086
+ // vector loop at least once if it generates a small epilogue loop, or
5087
+ // else we run the vector loop at least twice.
5088
+
5089
+ unsigned InterleaveCountUB = bit_floor (std::max (
5090
+ 1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
5091
+ MaxInterleaveCount = InterleaveCountLB;
5092
+
5093
+ if (InterleaveCountUB != InterleaveCountLB) {
5094
+ unsigned TailTripCountUB =
5095
+ (AvailableTC % (EstimatedVF * InterleaveCountUB));
5096
+ unsigned TailTripCountLB =
5097
+ (AvailableTC % (EstimatedVF * InterleaveCountLB));
5098
+ // If both produce the same scalar tail, maximize the IC to do the same
5099
+ // work in fewer vector loop iterations.
5100
+ if (TailTripCountUB == TailTripCountLB)
5101
+ MaxInterleaveCount = InterleaveCountUB;
5102
+ }
5103
+ } else {
5104
+ // If trip count is an estimated compile time constant, limit the
5105
+ // IC to be capped by the trip count divided by VF * 2, such that the
5106
+ // vector loop runs at least twice to make interleaving seem profitable
5107
+ // when there is an epilogue loop present. Since the exact trip count is
5108
+ // not known, we choose to be conservative in our IC estimate.
5109
+ MaxInterleaveCount = InterleaveCountLB;
5110
+ }
5110
5111
}
5111
5112
5112
5113
assert (MaxInterleaveCount > 0 &&
0 commit comments