@@ -4985,7 +4985,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
49854985 if (Legal->hasUncountableEarlyExit ())
49864986 return 1 ;
49874987
4988- auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop);
49894988 const bool HasReductions = !Legal->getReductionVars ().empty ();
49904989
49914990 // If we did not calculate the cost for VF (because the user selected the VF)
@@ -5062,51 +5061,53 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
50625061 }
50635062
50645063 unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning);
5065- unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
5066- if (KnownTC > 0 ) {
5067- // At least one iteration must be scalar when this constraint holds. So the
5068- // maximum available iterations for interleaving is one less.
5069- unsigned AvailableTC =
5070- requiresScalarEpilogue (VF.isVector ()) ? KnownTC - 1 : KnownTC;
5071-
5072- // If trip count is known we select between two prospective ICs, where
5073- // 1) the aggressive IC is capped by the trip count divided by VF
5074- // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5075- // The final IC is selected in a way that the epilogue loop trip count is
5076- // minimized while maximizing the IC itself, so that we either run the
5077- // vector loop at least once if it generates a small epilogue loop, or else
5078- // we run the vector loop at least twice.
5079-
5080- unsigned InterleaveCountUB = bit_floor (
5081- std::max (1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
5082- unsigned InterleaveCountLB = bit_floor (std::max (
5083- 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5084- MaxInterleaveCount = InterleaveCountLB;
5085-
5086- if (InterleaveCountUB != InterleaveCountLB) {
5087- unsigned TailTripCountUB =
5088- (AvailableTC % (EstimatedVF * InterleaveCountUB));
5089- unsigned TailTripCountLB =
5090- (AvailableTC % (EstimatedVF * InterleaveCountLB));
5091- // If both produce same scalar tail, maximize the IC to do the same work
5092- // in fewer vector loop iterations
5093- if (TailTripCountUB == TailTripCountLB)
5094- MaxInterleaveCount = InterleaveCountUB;
5095- }
5096- } else if (BestKnownTC) {
5064+
5065+ // Try to get the exact trip count or, failing that, an estimate based on
5066+ // profiling data or the constant maximum trip count from PSE.
5067+ if (auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop)) {
50975068 // At least one iteration must be scalar when this constraint holds. So the
50985069 // maximum available iterations for interleaving is one less.
50995070 unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
51005071 ? (*BestKnownTC) - 1
51015072 : *BestKnownTC;
51025073
5103- // If trip count is an estimated compile time constant, limit the
5104- // IC to be capped by the trip count divided by VF * 2, such that the vector
5105- // loop runs at least twice to make interleaving seem profitable when there
5106- // is an epilogue loop present. Since exact Trip count is not known we
5107- // choose to be conservative in our IC estimate.
5108- MaxInterleaveCount = bit_floor (std::max (
5074+ unsigned InterleaveCountLB = bit_floor (std::max (
51095075 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5076+
5077+ if (PSE.getSE ()->getSmallConstantTripCount (TheLoop) > 0 ) {
5078+ // If the best known trip count is exact, we select between two
5079+ // prospective ICs, where
5080+ //
5081+ // 1) the aggressive IC is capped by the trip count divided by VF
5082+ // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5083+ //
5084+ // The final IC is selected in a way that the epilogue loop trip count is
5085+ // minimized while maximizing the IC itself, so that we either run the
5086+ // vector loop at least once if it generates a small epilogue loop, or
5087+ // else we run the vector loop at least twice.
5088+
5089+ unsigned InterleaveCountUB = bit_floor (std::max (
5090+ 1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
5091+ MaxInterleaveCount = InterleaveCountLB;
5092+
5093+ if (InterleaveCountUB != InterleaveCountLB) {
5094+ unsigned TailTripCountUB =
5095+ (AvailableTC % (EstimatedVF * InterleaveCountUB));
5096+ unsigned TailTripCountLB =
5097+ (AvailableTC % (EstimatedVF * InterleaveCountLB));
5098+ // If both produce the same scalar tail, maximize the IC to do the same
5099+ // work in fewer vector loop iterations.
5100+ if (TailTripCountUB == TailTripCountLB)
5101+ MaxInterleaveCount = InterleaveCountUB;
5102+ }
5103+ } else {
5104+ // If the trip count is only an estimated compile-time constant, cap the
5105+ // IC at the trip count divided by (VF * 2), so that the vector loop runs
5106+ // at least twice to make interleaving seem profitable when there is an
5107+ // epilogue loop present. Since the exact trip count is not known, we
5108+ // choose to be conservative in our IC estimate.
5109+ MaxInterleaveCount = InterleaveCountLB;
5110+ }
51105111 }
51115112
51125113 assert (MaxInterleaveCount > 0 &&
0 commit comments