@@ -10171,6 +10171,12 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
1017110171 }
1017210172}
1017310173
10174+ /// For loops with uncountable early exits, find the cost of doing work when
10175+ /// exiting the loop early, such as calculating the final exit values of
10176+ /// variables used outside the loop.
10177+ /// TODO: This is currently overly pessimistic because the loop may not take
10178+ /// the early exit, but better to keep this conservative for now. In future,
10179+ /// it might be possible to relax this by using branch probabilities.
1017410180static InstructionCost calculateEarlyExitCost (LoopVectorizationCostModel &CM,
1017510181 VPlan &Plan, ElementCount VF) {
1017610182 InstructionCost Cost = 0 ;
@@ -10179,37 +10185,44 @@ static InstructionCost calculateEarlyExitCost(LoopVectorizationCostModel &CM,
1017910185 LLVM_DEBUG (
1018010186 dbgs () << " Calculating cost of work in vector early exit block:\n " );
1018110187 for (auto *ExitVPBB : Plan.getExitBlocks ()) {
10182- for (auto *PredVPBB : ExitVPBB->getPredecessors ())
10188+ for (auto *PredVPBB : ExitVPBB->getPredecessors ()) {
10189+ // If the predecessor is not the middle.block, then it must be the
10190+ // vector.early.exit block, which may contain work to calculate the exit
10191+ // values of variables used outside the loop.
1018310192 if (PredVPBB != Plan.getMiddleBlock ())
1018410193 for (auto &R : *(cast<VPBasicBlock>(PredVPBB)))
1018510194 Cost += R.cost (VF, CostCtx);
10195+ }
1018610196 }
1018710197 return Cost;
1018810198}
1018910199
10200+ /// This function determines whether or not it's still profitable to vectorize
10201+ /// the loop given the extra work we have to do outside of the loop:
10202+ ///  1. Perform the runtime checks before entering the loop to ensure it's safe
10203+ ///     to vectorize.
10204+ ///  2. In the case of loops with uncountable early exits, we may have to do
10205+ ///     extra work when exiting the loop early, such as calculating the final
10206+ ///     exit values of variables used outside the loop.
1019010207static bool isOutsideLoopWorkProfitable (GeneratedRTChecks &Checks,
10191- VectorizationFactor &VF, Loop *L,
10192- const TargetTransformInfo &TTI ,
10208+ VectorizationFactor &VF,
10209+ LoopVectorizationCostModel &CM ,
1019310210 PredicatedScalarEvolution &PSE,
10194- ScalarEpilogueLowering SEL,
10195- std::optional<unsigned > VScale,
10196- InstructionCost EarlyExitCost) {
10197- InstructionCost CheckCost = Checks.getCost ();
10198- if (!CheckCost.isValid () && !EarlyExitCost.isValid ())
10211+ VPlan &Plan,
10212+ ScalarEpilogueLowering SEL) {
10213+ InstructionCost TotalCost = Checks.getCost ();
10214+ if (!TotalCost.isValid ())
1019910215 return false ;
1020010216
10201- InstructionCost TotalCost = 0 ;
10202- if (CheckCost.isValid ())
10203- TotalCost += CheckCost;
10204-
1020510217 // Add on the cost of work required in the vector early exit block, if one
1020610218 // exists.
10207- if (EarlyExitCost. isValid ())
10208- TotalCost += EarlyExitCost ;
10219+ if (CM. Legal -> hasUncountableEarlyExit ())
10220+ TotalCost += calculateEarlyExitCost (CM, Plan, VF. Width ) ;
1020910221
1021010222 // When interleaving only scalar and vector cost will be equal, which in turn
1021110223 // would lead to a divide by 0. Fall back to hard threshold.
1021210224 if (VF.Width .isScalar ()) {
10225+ // TODO: Should we rename VectorizeMemoryCheckThreshold?
1021310226 if (TotalCost > VectorizeMemoryCheckThreshold) {
1021410227 LLVM_DEBUG (
1021510228 dbgs ()
@@ -10236,7 +10249,9 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1023610249 // The total cost of the vector loop is
1023710250 // RtC + VecC * (TC / VF) + EpiC
1023810251 // where
10239- // * RtC is the cost of the generated runtime checks
10252+ // * RtC is the cost of the generated runtime checks plus the cost of
10253+ // performing any additional work in the vector.early.exit block for loops
10254+ // with uncountable early exits.
1024010255 // * VecC is the cost of a single vector iteration.
1024110256 // * TC is the actual trip count of the loop
1024210257 // * VF is the vectorization factor
@@ -10253,7 +10268,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1025310268 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
1025410269 // the computations are performed on doubles, not integers and the result
1025510270 // is rounded up, hence we get an upper estimate of the TC.
10256- unsigned IntVF = getEstimatedRuntimeVF (VF.Width , VScale );
10271+ unsigned IntVF = getEstimatedRuntimeVF (VF.Width , CM. getVScaleForTuning () );
1025710272 uint64_t RtC = *TotalCost.getValue ();
1025810273 uint64_t Div = ScalarC * IntVF - *VF.Cost .getValue ();
1025910274 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil (RtC * IntVF, Div);
@@ -10281,7 +10296,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1028110296
1028210297 // Skip vectorization if the expected trip count is less than the minimum
1028310298 // required trip count.
10284- if (auto ExpectedTC = getSmallBestKnownTC (PSE, L )) {
10299+ if (auto ExpectedTC = getSmallBestKnownTC (PSE, CM. TheLoop )) {
1028510300 if (ElementCount::isKnownLT (ElementCount::getFixed (*ExpectedTC),
1028610301 VF.MinProfitableTripCount )) {
1028710302 LLVM_DEBUG (dbgs () << " LV: Vectorization is not beneficial: expected "
@@ -10678,17 +10693,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1067810693 if (VF.Width .isVector () || SelectedIC > 1 )
1067910694 Checks.create (L, *LVL.getLAI (), PSE.getPredicate (), VF.Width , SelectedIC);
1068010695
10681- InstructionCost EarlyExitCost = InstructionCost::getInvalid ();
10682- if (VF.Width .isVector () && LVL.hasUncountableEarlyExit ())
10683- EarlyExitCost =
10684- calculateEarlyExitCost (CM, LVP.getPlanFor (VF.Width ), VF.Width );
10685-
1068610696 // Check if it is profitable to vectorize with runtime checks.
1068710697 bool ForceVectorization =
1068810698 Hints.getForce () == LoopVectorizeHints::FK_Enabled;
1068910699 if (!ForceVectorization &&
10690- !isOutsideLoopWorkProfitable (Checks, VF, L, *TTI, PSE, SEL ,
10691- CM. getVScaleForTuning ( ), EarlyExitCost )) {
10700+ !isOutsideLoopWorkProfitable (Checks, VF, CM, PSE,
10701+ LVP. getPlanFor (VF. Width ), SEL )) {
1069210702 ORE->emit ([&]() {
1069310703 return OptimizationRemarkAnalysisAliasing (
1069410704 DEBUG_TYPE, " CantReorderMemOps" , L->getStartLoc (),
0 commit comments