@@ -5008,35 +5008,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
50085008 unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning);
50095009 unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
50105010 if (KnownTC > 0 ) {
5011- // At least one iteration must be scalar when this constraint holds. So the
5012- // maximum available iterations for interleaving is one less.
5013- unsigned AvailableTC =
5014- requiresScalarEpilogue (VF.isVector ()) ? KnownTC - 1 : KnownTC;
5015-
5016- // If trip count is known we select between two prospective ICs, where
5017- // 1) the aggressive IC is capped by the trip count divided by VF
5018- // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5019- // The final IC is selected in a way that the epilogue loop trip count is
5020- // minimized while maximizing the IC itself, so that we either run the
5021- // vector loop at least once if it generates a small epilogue loop, or else
5022- // we run the vector loop at least twice.
5023-
5024- unsigned InterleaveCountUB = bit_floor (
5025- std::max (1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
5026- unsigned InterleaveCountLB = bit_floor (std::max (
5027- 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5028- MaxInterleaveCount = InterleaveCountLB;
5029-
5030- if (InterleaveCountUB != InterleaveCountLB) {
5031- unsigned TailTripCountUB =
5032- (AvailableTC % (EstimatedVF * InterleaveCountUB));
5033- unsigned TailTripCountLB =
5034- (AvailableTC % (EstimatedVF * InterleaveCountLB));
5035- // If both produce same scalar tail, maximize the IC to do the same work
5036- // in fewer vector loop iterations
5037- if (TailTripCountUB == TailTripCountLB)
5038- MaxInterleaveCount = InterleaveCountUB;
5039- }
5011+ // At least one iteration must be scalar when this constraint holds. So the
5012+ // maximum available iterations for interleaving is one less.
5013+ unsigned AvailableTC =
5014+ requiresScalarEpilogue (VF.isVector ()) ? KnownTC - 1 : KnownTC;
5015+
5016+ // If trip count is known we select between two prospective ICs, where
5017+ // 1) the aggressive IC is capped by the trip count divided by VF
5018+ // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5019+ // The final IC is selected in a way that the epilogue loop trip count is
5020+ // minimized while maximizing the IC itself, so that we either run the
5021+ // vector loop at least once if it generates a small epilogue loop, or else
5022+ // we run the vector loop at least twice.
5023+
5024+ unsigned InterleaveCountUB = bit_floor (
5025+ std::max (1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
5026+ unsigned InterleaveCountLB = bit_floor (std::max (
5027+ 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5028+ MaxInterleaveCount = InterleaveCountLB;
5029+
5030+ if (InterleaveCountUB != InterleaveCountLB) {
5031+ unsigned TailTripCountUB =
5032+ (AvailableTC % (EstimatedVF * InterleaveCountUB));
5033+ unsigned TailTripCountLB =
5034+ (AvailableTC % (EstimatedVF * InterleaveCountLB));
5035+ // If both produce same scalar tail, maximize the IC to do the same work
5036+ // in fewer vector loop iterations
5037+ if (TailTripCountUB == TailTripCountLB)
5038+ MaxInterleaveCount = InterleaveCountUB;
5039+ }
50405040 } else if (BestKnownTC) {
50415041 // At least one iteration must be scalar when this constraint holds. So the
50425042 // maximum available iterations for interleaving is one less.
0 commit comments