@@ -1516,7 +1516,10 @@ class LoopVectorizationCostModel {
15161516 // / Returns true if epilogue vectorization is considered profitable, and
15171517 // / false otherwise.
15181518 // / \p VF is the vectorization factor chosen for the original loop.
1519- bool isEpilogueVectorizationProfitable (const ElementCount VF) const ;
1519+ // / \p Multiplier is an additional scaling factor applied to VF before
1520+ // / comparing to EpilogueVectorizationMinVF.
1521+ bool isEpilogueVectorizationProfitable (const ElementCount VF,
1522+ const unsigned Multiplier) const ;
15201523
15211524 // / Returns the execution time cost of an instruction for a given vector
15221525 // / width. Vector width of one means scalar.
@@ -4289,12 +4292,11 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
42894292}
42904293
42914294bool LoopVectorizationPlanner::isMoreProfitable (
4292- const VectorizationFactor &A, const VectorizationFactor &B) const {
4295+ const VectorizationFactor &A, const VectorizationFactor &B,
4296+ const unsigned MaxTripCount) const {
42934297 InstructionCost CostA = A.Cost ;
42944298 InstructionCost CostB = B.Cost ;
42954299
4296- unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount ();
4297-
42984300 // Improve estimate for the vector width if it is scalable.
42994301 unsigned EstimatedWidthA = A.Width .getKnownMinValue ();
43004302 unsigned EstimatedWidthB = B.Width .getKnownMinValue ();
@@ -4343,6 +4345,12 @@ bool LoopVectorizationPlanner::isMoreProfitable(
43434345 return CmpFn (RTCostA, RTCostB);
43444346}
43454347
4348+ bool LoopVectorizationPlanner::isMoreProfitable ( // Two-argument convenience overload.
4349+ const VectorizationFactor &A, const VectorizationFactor &B) const {
4350+ const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount (); // Trip-count bound queried from PSE.
4351+ return LoopVectorizationPlanner::isMoreProfitable (A, B, MaxTripCount); // Forward to the three-argument overload.
4352+ }
4353+
43464354void LoopVectorizationPlanner::emitInvalidCostRemarks (
43474355 OptimizationRemarkEmitter *ORE) {
43484356 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
@@ -4661,7 +4669,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
46614669}
46624670
46634671bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable (
4664- const ElementCount VF) const {
4672+ const ElementCount VF, const unsigned Multiplier ) const {
46654673 // FIXME: We need a much better cost-model to take different parameters such
46664674 // as register pressure, code size increase and cost of extra branches into
46674675 // account. For now we apply a very crude heuristic and only consider loops
@@ -4676,9 +4684,6 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
46764684 if (TTI.getMaxInterleaveFactor (VF) <= 1 )
46774685 return false ;
46784686
4679- unsigned Multiplier = 1 ;
4680- if (VF.isScalable ())
4681- Multiplier = getVScaleForTuning (TheLoop, TTI).value_or (1 );
46824687 if ((Multiplier * VF.getKnownMinValue ()) >= EpilogueVectorizationMinVF)
46834688 return true ;
46844689 return false ;
@@ -4724,7 +4729,11 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47244729 return Result;
47254730 }
47264731
4727- if (!CM.isEpilogueVectorizationProfitable (MainLoopVF)) {
4732+ unsigned Multiplier = IC;
4733+ if (MainLoopVF.isScalable ())
4734+ Multiplier = getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4735+
4736+ if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, Multiplier)) {
47284737 LLVM_DEBUG (dbgs () << " LEV: Epilogue vectorization is not profitable for "
47294738 " this loop\n " );
47304739 return Result;
@@ -4743,16 +4752,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47434752 ScalarEvolution &SE = *PSE.getSE ();
47444753 Type *TCType = Legal->getWidestInductionType ();
47454754 const SCEV *RemainingIterations = nullptr ;
4755+ unsigned MaxTripCount = 0 ;
47464756 for (auto &NextVF : ProfitableVFs) {
47474757 // Skip candidate VFs without a corresponding VPlan.
47484758 if (!hasPlanWithVF (NextVF.Width ))
47494759 continue ;
47504760
4751- // Skip candidate VFs with widths >= the estimate runtime VF (scalable
4752- // vectors) or the VF of the main loop (fixed vectors).
4761+ // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4762+ // vectors) or > the VF of the main loop (fixed vectors).
47534763 if ((!NextVF.Width .isScalable () && MainLoopVF.isScalable () &&
47544764 ElementCount::isKnownGE (NextVF.Width , EstimatedRuntimeVF)) ||
4755- ElementCount::isKnownGE (NextVF.Width , MainLoopVF))
4765+ (NextVF.Width .isScalable () &&
4766+ ElementCount::isKnownGE (NextVF.Width , MainLoopVF)) ||
4767+ (!NextVF.Width .isScalable () && !MainLoopVF.isScalable () &&
4768+ ElementCount::isKnownGT (NextVF.Width , MainLoopVF)))
47564769 continue ;
47574770
47584771 // If NextVF is greater than the number of remaining iterations, the
@@ -4766,6 +4779,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47664779 " Trip count SCEV must be computable" );
47674780 RemainingIterations = SE.getURemExpr (
47684781 TC, SE.getConstant (TCType, MainLoopVF.getKnownMinValue () * IC));
4782+ MaxTripCount = MainLoopVF.getKnownMinValue () * IC - 1 ;
4783+ if (SE.isKnownPredicate (CmpInst::ICMP_ULT, RemainingIterations,
4784+ SE.getConstant (TCType, MaxTripCount))) {
4785+ MaxTripCount =
4786+ SE.getUnsignedRangeMax (RemainingIterations).getZExtValue ();
4787+ }
4788+ LLVM_DEBUG (dbgs () << " LEV: Maximum Trip Count for Epilogue: "
4789+ << MaxTripCount << " \n " );
47694790 }
47704791 if (SE.isKnownPredicate (
47714792 CmpInst::ICMP_UGT,
@@ -4774,7 +4795,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47744795 continue ;
47754796 }
47764797
4777- if (Result.Width .isScalar () || isMoreProfitable (NextVF, Result))
4798+ if (Result.Width .isScalar () ||
4799+ isMoreProfitable (NextVF, Result, MaxTripCount))
47784800 Result = NextVF;
47794801 }
47804802
0 commit comments