@@ -10165,6 +10165,12 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
1016510165 }
1016610166}
1016710167
10168+ /// For loops with uncountable early exits, find the cost of doing work when
10169+ /// exiting the loop early, such as calculating the final exit values of
10170+ /// variables used outside the loop.
10171+ /// TODO: This is currently overly pessimistic because the loop may not take
10172+ /// the early exit, but it is better to keep this conservative for now. In the
10173+ /// future, it might be possible to relax this by using branch probabilities.
1016810174static InstructionCost calculateEarlyExitCost (LoopVectorizationCostModel &CM,
1016910175 VPlan &Plan, ElementCount VF) {
1017010176 InstructionCost Cost = 0 ;
@@ -10173,37 +10179,44 @@ static InstructionCost calculateEarlyExitCost(LoopVectorizationCostModel &CM,
1017310179 LLVM_DEBUG (
1017410180 dbgs () << " Calculating cost of work in vector early exit block:\n " );
1017510181 for (auto *ExitVPBB : Plan.getExitBlocks ()) {
10176- for (auto *PredVPBB : ExitVPBB->getPredecessors ())
10182+ for (auto *PredVPBB : ExitVPBB->getPredecessors ()) {
10183+ // If the predecessor is not the middle.block, then it must be the
10184+ // vector.early.exit block, which may contain work to calculate the exit
10185+ // values of variables used outside the loop.
1017710186 if (PredVPBB != Plan.getMiddleBlock ())
1017810187 for (auto &R : *(cast<VPBasicBlock>(PredVPBB)))
1017910188 Cost += R.cost (VF, CostCtx);
10189+ }
1018010190 }
1018110191 return Cost;
1018210192}
1018310193
10194+ /// This function determines whether or not it's still profitable to vectorize
10195+ /// the loop given the extra work we have to do outside of the loop:
10196+ ///  1. Perform the runtime checks before entering the loop to ensure it's safe
10197+ ///     to vectorize.
10198+ ///  2. In the case of loops with uncountable early exits, we may have to do
10199+ ///     extra work when exiting the loop early, such as calculating the final
10200+ ///     exit values of variables used outside the loop.
1018410201static bool isOutsideLoopWorkProfitable (GeneratedRTChecks &Checks,
10185- VectorizationFactor &VF, Loop *L,
10186- const TargetTransformInfo &TTI ,
10202+ VectorizationFactor &VF,
10203+ LoopVectorizationCostModel &CM ,
1018710204 PredicatedScalarEvolution &PSE,
10188- ScalarEpilogueLowering SEL,
10189- std::optional<unsigned > VScale,
10190- InstructionCost EarlyExitCost) {
10191- InstructionCost CheckCost = Checks.getCost ();
10192- if (!CheckCost.isValid () && !EarlyExitCost.isValid ())
10205+ VPlan &Plan,
10206+ ScalarEpilogueLowering SEL) {
10207+ InstructionCost TotalCost = Checks.getCost ();
10208+ if (!TotalCost.isValid ())
1019310209 return false ;
1019410210
10195- InstructionCost TotalCost = 0 ;
10196- if (CheckCost.isValid ())
10197- TotalCost += CheckCost;
10198-
1019910211 // Add on the cost of work required in the vector early exit block, if one
1020010212 // exists.
10201- if (EarlyExitCost. isValid ())
10202- TotalCost += EarlyExitCost ;
10213+ if (CM. Legal -> hasUncountableEarlyExit ())
10214+ TotalCost += calculateEarlyExitCost (CM, Plan, VF. Width ) ;
1020310215
1020410216 // When interleaving only scalar and vector cost will be equal, which in turn
1020510217 // would lead to a divide by 0. Fall back to hard threshold.
1020610218 if (VF.Width .isScalar ()) {
10219+ // TODO: Should we rename VectorizeMemoryCheckThreshold?
1020710220 if (TotalCost > VectorizeMemoryCheckThreshold) {
1020810221 LLVM_DEBUG (
1020910222 dbgs ()
@@ -10229,7 +10242,9 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1022910242 // The total cost of the vector loop is
1023010243 // RtC + VecC * (TC / VF) + EpiC
1023110244 // where
10232- // * RtC is the cost of the generated runtime checks
10245+ // * RtC is the cost of the generated runtime checks plus the cost of
10246+ // performing any additional work in the vector.early.exit block for loops
10247+ // with uncountable early exits.
1023310248 // * VecC is the cost of a single vector iteration.
1023410249 // * TC is the actual trip count of the loop
1023510250 // * VF is the vectorization factor
@@ -10246,7 +10261,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1024610261 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
1024710262 // the computations are performed on doubles, not integers and the result
1024810263 // is rounded up, hence we get an upper estimate of the TC.
10249- unsigned IntVF = getEstimatedRuntimeVF (VF.Width , VScale );
10264+ unsigned IntVF = getEstimatedRuntimeVF (VF.Width , CM. getVScaleForTuning () );
1025010265 uint64_t RtC = *TotalCost.getValue ();
1025110266 uint64_t Div = ScalarC * IntVF - *VF.Cost .getValue ();
1025210267 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil (RtC * IntVF, Div);
@@ -10274,7 +10289,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1027410289
1027510290 // Skip vectorization if the expected trip count is less than the minimum
1027610291 // required trip count.
10277- if (auto ExpectedTC = getSmallBestKnownTC (PSE, L )) {
10292+ if (auto ExpectedTC = getSmallBestKnownTC (PSE, CM. TheLoop )) {
1027810293 if (ElementCount::isKnownLT (ElementCount::getFixed (*ExpectedTC),
1027910294 VF.MinProfitableTripCount )) {
1028010295 LLVM_DEBUG (dbgs () << " LV: Vectorization is not beneficial: expected "
@@ -10671,17 +10686,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1067110686 if (VF.Width .isVector () || SelectedIC > 1 )
1067210687 Checks.create (L, *LVL.getLAI (), PSE.getPredicate (), VF.Width , SelectedIC);
1067310688
10674- InstructionCost EarlyExitCost = InstructionCost::getInvalid ();
10675- if (VF.Width .isVector () && LVL.hasUncountableEarlyExit ())
10676- EarlyExitCost =
10677- calculateEarlyExitCost (CM, LVP.getPlanFor (VF.Width ), VF.Width );
10678-
1067910689 // Check if it is profitable to vectorize with runtime checks.
1068010690 bool ForceVectorization =
1068110691 Hints.getForce () == LoopVectorizeHints::FK_Enabled;
1068210692 if (!ForceVectorization &&
10683- !isOutsideLoopWorkProfitable (Checks, VF, L, *TTI, PSE, SEL ,
10684- CM. getVScaleForTuning ( ), EarlyExitCost )) {
10693+ !isOutsideLoopWorkProfitable (Checks, VF, CM, PSE,
10694+ LVP. getPlanFor (VF. Width ), SEL )) {
1068510695 ORE->emit ([&]() {
1068610696 return OptimizationRemarkAnalysisAliasing (
1068710697 DEBUG_TYPE, " CantReorderMemOps" , L->getStartLoc (),
0 commit comments