@@ -1509,7 +1509,10 @@ class LoopVectorizationCostModel {
15091509 // / Returns true if epilogue vectorization is considered profitable, and
15101510 // / false otherwise.
15111511 // / \p VF is the vectorization factor chosen for the original loop.
1512- bool isEpilogueVectorizationProfitable (const ElementCount VF) const ;
1512+ // / \p Multiplier is an additional scaling factor applied to VF before
1513+ // / comparing to EpilogueVectorizationMinVF.
1514+ bool isEpilogueVectorizationProfitable (const ElementCount VF,
1515+ const unsigned Multiplier) const ;
15131516
15141517 // / Returns the execution time cost of an instruction for a given vector
15151518 // / width. Vector width of one means scalar.
@@ -4257,12 +4260,11 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
42574260}
42584261
42594262bool LoopVectorizationPlanner::isMoreProfitable (
4260- const VectorizationFactor &A, const VectorizationFactor &B) const {
4263+ const VectorizationFactor &A, const VectorizationFactor &B,
4264+ const unsigned MaxTripCount) const {
42614265 InstructionCost CostA = A.Cost ;
42624266 InstructionCost CostB = B.Cost ;
42634267
4264- unsigned MaxTripCount = PSE.getSE ()->getSmallConstantMaxTripCount (OrigLoop);
4265-
42664268 // Improve estimate for the vector width if it is scalable.
42674269 unsigned EstimatedWidthA = A.Width .getKnownMinValue ();
42684270 unsigned EstimatedWidthB = B.Width .getKnownMinValue ();
@@ -4311,6 +4313,13 @@ bool LoopVectorizationPlanner::isMoreProfitable(
43114313 return CmpFn (RTCostA, RTCostB);
43124314}
43134315
4316+ bool LoopVectorizationPlanner::isMoreProfitable (
4317+ const VectorizationFactor &A, const VectorizationFactor &B) const {
4318+ const unsigned MaxTripCount =
4319+ PSE.getSE ()->getSmallConstantMaxTripCount (OrigLoop);
4320+ return LoopVectorizationPlanner::isMoreProfitable (A, B, MaxTripCount);
4321+ }
4322+
43144323void LoopVectorizationPlanner::emitInvalidCostRemarks (
43154324 OptimizationRemarkEmitter *ORE) {
43164325 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
@@ -4620,7 +4629,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
46204629}
46214630
46224631bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable (
4623- const ElementCount VF) const {
4632+ const ElementCount VF, const unsigned Multiplier ) const {
46244633 // FIXME: We need a much better cost-model to take different parameters such
46254634 // as register pressure, code size increase and cost of extra branches into
46264635 // account. For now we apply a very crude heuristic and only consider loops
@@ -4635,9 +4644,6 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
46354644 if (TTI.getMaxInterleaveFactor (VF) <= 1 )
46364645 return false ;
46374646
4638- unsigned Multiplier = 1 ;
4639- if (VF.isScalable ())
4640- Multiplier = getVScaleForTuning (TheLoop, TTI).value_or (1 );
46414647 if ((Multiplier * VF.getKnownMinValue ()) >= EpilogueVectorizationMinVF)
46424648 return true ;
46434649 return false ;
@@ -4683,7 +4689,11 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
46834689 return Result;
46844690 }
46854691
4686- if (!CM.isEpilogueVectorizationProfitable (MainLoopVF)) {
4692+ unsigned Multiplier = IC;
4693+ if (MainLoopVF.isScalable ())
4694+ Multiplier = getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4695+
4696+ if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, Multiplier)) {
46874697 LLVM_DEBUG (dbgs () << " LEV: Epilogue vectorization is not profitable for "
46884698 " this loop\n " );
46894699 return Result;
@@ -4702,16 +4712,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47024712 ScalarEvolution &SE = *PSE.getSE ();
47034713 Type *TCType = Legal->getWidestInductionType ();
47044714 const SCEV *RemainingIterations = nullptr ;
4715+ unsigned MaxTripCount = 0 ;
47054716 for (auto &NextVF : ProfitableVFs) {
47064717 // Skip candidate VFs without a corresponding VPlan.
47074718 if (!hasPlanWithVF (NextVF.Width ))
47084719 continue ;
47094720
4710- // Skip candidate VFs with widths >= the estimate runtime VF (scalable
4711- // vectors) or the VF of the main loop (fixed vectors).
4721+ // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4722+ // vectors) or > the VF of the main loop (fixed vectors).
47124723 if ((!NextVF.Width .isScalable () && MainLoopVF.isScalable () &&
47134724 ElementCount::isKnownGE (NextVF.Width , EstimatedRuntimeVF)) ||
4714- ElementCount::isKnownGE (NextVF.Width , MainLoopVF))
4725+ (NextVF.Width .isScalable () &&
4726+ ElementCount::isKnownGE (NextVF.Width , MainLoopVF)) ||
4727+ (!NextVF.Width .isScalable () && !MainLoopVF.isScalable () &&
4728+ ElementCount::isKnownGT (NextVF.Width , MainLoopVF)))
47154729 continue ;
47164730
47174731 // If NextVF is greater than the number of remaining iterations, the
@@ -4725,6 +4739,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47254739 " Trip count SCEV must be computable" );
47264740 RemainingIterations = SE.getURemExpr (
47274741 TC, SE.getConstant (TCType, MainLoopVF.getKnownMinValue () * IC));
4742+ const APInt MaxRemainingIterations =
4743+ SE.getUnsignedRangeMax (RemainingIterations);
4744+ // Guard against huge trip counts.
4745+ if (MaxRemainingIterations.getActiveBits () <= 32 ) {
4746+ MaxTripCount = MaxRemainingIterations.getZExtValue ();
4747+ LLVM_DEBUG (dbgs () << " LEV: Maximum Trip Count for Epilogue: "
4748+ << MaxTripCount << " \n " );
4749+ }
47284750 }
47294751 if (SE.isKnownPredicate (
47304752 CmpInst::ICMP_UGT,
@@ -4733,7 +4755,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47334755 continue ;
47344756 }
47354757
4736- if (Result.Width .isScalar () || isMoreProfitable (NextVF, Result))
4758+ if (Result.Width .isScalar () ||
4759+ isMoreProfitable (NextVF, Result, MaxTripCount))
47374760 Result = NextVF;
47384761 }
47394762
0 commit comments