@@ -4930,7 +4930,6 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49304930 if (Legal->hasUncountableEarlyExit ())
49314931 return 1 ;
49324932
4933- auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop);
49344933 const bool HasReductions = !Legal->getReductionVars ().empty ();
49354934
49364935 // If we did not calculate the cost for VF (because the user selected the VF)
@@ -5006,25 +5005,33 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
50065005 }
50075006
50085007 unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning);
5009- unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
5010- if (KnownTC > 0 ) {
5011- // At least one iteration must be scalar when this constraint holds. So the
5012- // maximum available iterations for interleaving is one less.
5013- unsigned AvailableTC =
5014- requiresScalarEpilogue (VF.isVector ()) ? KnownTC - 1 : KnownTC;
5015-
5016- // If trip count is known we select between two prospective ICs, where
5008+
5009+ // Try to get the exact trip count, or an estimate based on profiling data or
5010+ // ConstantMax from PSE, failing that.
5011+ if (auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop)) {
5012+ // At least one iteration must be scalar when this constraint holds. So the
5013+ // maximum available iterations for interleaving is one less.
5014+ unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
5015+ ? (*BestKnownTC) - 1
5016+ : *BestKnownTC;
5017+
5018+ unsigned InterleaveCountLB = bit_floor (std::max (
5019+ 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5020+
5021+ if (PSE.getSE ()->getSmallConstantTripCount (TheLoop) > 0 ) {
5022+ // If the estimated trip count is actually an exact one we select between
5023+ // two prospective ICs, where
5024+ //
50175025 // 1) the aggressive IC is capped by the trip count divided by VF
50185026 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5027+ //
50195028 // The final IC is selected in a way that the epilogue loop trip count is
50205029 // minimized while maximizing the IC itself, so that we either run the
5021- // vector loop at least once if it generates a small epilogue loop, or else
5022- // we run the vector loop at least twice.
5030+ // vector loop at least once if it generates a small epilogue loop, or
5031+ // else we run the vector loop at least twice.
50235032
5024- unsigned InterleaveCountUB = bit_floor (
5025- std::max (1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
5026- unsigned InterleaveCountLB = bit_floor (std::max (
5027- 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5033+ unsigned InterleaveCountUB = bit_floor (std::max (
5034+ 1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
50285035 MaxInterleaveCount = InterleaveCountLB;
50295036
50305037 if (InterleaveCountUB != InterleaveCountLB) {
@@ -5037,20 +5044,14 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
50375044 if (TailTripCountUB == TailTripCountLB)
50385045 MaxInterleaveCount = InterleaveCountUB;
50395046 }
5040- } else if (BestKnownTC) {
5041- // At least one iteration must be scalar when this constraint holds. So the
5042- // maximum available iterations for interleaving is one less.
5043- unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
5044- ? (*BestKnownTC) - 1
5045- : *BestKnownTC;
5046-
5047- // If trip count is an estimated compile time constant, limit the
5048- // IC to be capped by the trip count divided by VF * 2, such that the vector
5049- // loop runs at least twice to make interleaving seem profitable when there
5050- // is an epilogue loop present. Since exact Trip count is not known we
5051- // choose to be conservative in our IC estimate.
5052- MaxInterleaveCount = bit_floor (std::max (
5053- 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5047+ } else {
5048+ // If trip count is an estimated compile time constant, limit the
5049+ // IC to be capped by the trip count divided by VF * 2, such that the
5050+ // vector loop runs at least twice to make interleaving seem profitable
5051+ // when there is an epilogue loop present. Since exact Trip count is not
5052+ // known we choose to be conservative in our IC estimate.
5053+ MaxInterleaveCount = InterleaveCountLB;
5054+ }
50545055 }
50555056
50565057 assert (MaxInterleaveCount > 0 &&
0 commit comments