// Patch summary -- LoopVectorizationPlanner.h (one hunk, -535,13 +535,13).
// Both declarations of LoopVectorizationPlanner::isMoreProfitable gain a
// trailing `bool HasTail` parameter: the overload taking (A, B) and the
// overload taking (A, B, MaxTripCount). No other line of the hunk changes.
// NOTE(review): the doc comments shown as context do not describe HasTail.
// In the LoopVectorize.cpp part of this patch the flag selects between
// VectorCost * (TC / VF) + ScalarCost * (TC % VF) when true and
// VectorCost * divideCeil(TC, VF) when false; consider documenting it as
// \p HasTail on both declarations.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index f80379b980bec..8f6a73d0a2dd8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -535,13 +535,13 @@ class LoopVectorizationPlanner { /// Returns true if the per-lane cost of VectorizationFactor A is lower than /// that of B. bool isMoreProfitable(const VectorizationFactor &A, - const VectorizationFactor &B) const; + const VectorizationFactor &B, bool HasTail) const; /// Returns true if the per-lane cost of VectorizationFactor A is lower than /// that of B in the context of vectorizing a loop with known \p MaxTripCount. bool isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B, - const unsigned MaxTripCount) const; + const unsigned MaxTripCount, bool HasTail) const; /// Determines if we have the infrastructure to vectorize the loop and its /// epilogue, assuming the main loop is vectorized by \p VF. 
// Patch summary -- LoopVectorize.cpp, hunks -4256 through -4789 (the last
// `@@ -7768,11 +7773,11 @@` header on these lines opens the computeBestVF
// hunk, whose body follows on the next physical line).
//
// * isMoreProfitable(A, B, MaxTripCount, HasTail): the signature gains
//   `bool HasTail`. The early return taken when MaxTripCount is zero is
//   context only and does not read HasTail.
// * GetCostForTC lambda: the capture list changes from [MaxTripCount, this]
//   to [MaxTripCount, HasTail], and the CM.foldTailByMasking() query is
//   removed. With HasTail set it returns
//   VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
//   otherwise VectorCost * divideCeil(MaxTripCount, VF). These are the same
//   two formulas as before with the condition inverted, so passing
//   HasTail == !CM.foldTailByMasking() reproduces the pre-patch result.
// * isMoreProfitable(A, B, HasTail): still reads the trip count from
//   PSE.getSmallConstantMaxTripCount() and forwards HasTail unchanged.
// * Call sites: the candidate comparison in selectVectorizationFactor passes
//   P->hasScalarTail(); the LLVM_DEBUG "forced by a user" check in the same
//   function and the comparison in selectEpilogueVectorizationFactor pass
//   !CM.foldTailByMasking().
// NOTE(review): P is presumably the VPlan being costed, and hasScalarTail()
// the accessor this patch adds to VPlan.h -- confirm against the full file.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a4d546f698d5f..249e8fca4cf0a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4256,9 +4256,10 @@ static unsigned getEstimatedRuntimeVF(ElementCount VF, return EstimatedVF; } -bool LoopVectorizationPlanner::isMoreProfitable( - const VectorizationFactor &A, const VectorizationFactor &B, - const unsigned MaxTripCount) const { +bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B, + const unsigned MaxTripCount, + bool HasTail) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; @@ -4296,9 +4297,9 @@ bool LoopVectorizationPlanner::isMoreProfitable( if (!MaxTripCount) return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA); - auto GetCostForTC = [MaxTripCount, this](unsigned VF, - InstructionCost VectorCost, - InstructionCost ScalarCost) { + auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF, + InstructionCost VectorCost, + InstructionCost ScalarCost) { // If the trip count is a known (possibly small) constant, the trip count // will be rounded up to an integer number of iterations under // FoldTailByMasking. The total cost in that case will be @@ -4307,9 +4308,10 @@ bool LoopVectorizationPlanner::isMoreProfitable( // some extra overheads, but for the purpose of comparing the costs of // different VFs we can use this to compare the total loop-body cost // expected after vectorization. 
- if (CM.foldTailByMasking()) - return VectorCost * divideCeil(MaxTripCount, VF); - return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF); + if (HasTail) + return VectorCost * (MaxTripCount / VF) + + ScalarCost * (MaxTripCount % VF); + return VectorCost * divideCeil(MaxTripCount, VF); }; auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost); @@ -4317,10 +4319,12 @@ bool LoopVectorizationPlanner::isMoreProfitable( return CmpFn(RTCostA, RTCostB); } -bool LoopVectorizationPlanner::isMoreProfitable( - const VectorizationFactor &A, const VectorizationFactor &B) const { +bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B, + bool HasTail) const { const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); - return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount); + return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, + HasTail); } void LoopVectorizationPlanner::emitInvalidCostRemarks( @@ -4609,7 +4613,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { continue; } - if (isMoreProfitable(Candidate, ChosenFactor)) + if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail())) ChosenFactor = Candidate; } } @@ -4623,7 +4627,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { } LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && - !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() + !isMoreProfitable(ChosenFactor, ScalarCost, + !CM.foldTailByMasking())) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); return ChosenFactor; @@ -4789,7 +4794,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( } if (Result.Width.isScalar() || - isMoreProfitable(NextVF, Result, MaxTripCount)) + isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking())) Result = NextVF; } @@ -7768,11 +7773,11 @@ 
VectorizationFactor LoopVectorizationPlanner::computeBestVF() { InstructionCost Cost = cost(*P, VF); VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); - if (isMoreProfitable(CurrentFactor, BestFactor)) + if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) BestFactor = CurrentFactor; // If profitable add it to ProfitableVF list. - if (isMoreProfitable(CurrentFactor, ScalarFactor)) + if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail())) ProfitableVFs.push_back(CurrentFactor); } }
// Note on the computeBestVF hunk that ends just above: both isMoreProfitable
// calls (the comparison against BestFactor and the comparison against
// ScalarFactor) now pass P->hasScalarTail() as the HasTail argument.
//
// Patch summary -- VPlan.h (one hunk, -3790,6 +3790,13).
// Adds `bool hasScalarTail() const` to class VPlan, directly after
// hasEarlyExit(). It returns true when
// getScalarPreheader()->getNumPredecessors() != 0. The added doc comment
// states that the result relies on unneeded branches to the scalar tail loop
// having been removed.
// NOTE(review): presumably callers should only query this once that branch
// removal has run on the plan -- confirm where in the pipeline that happens.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 7cdcb24e9760b..60c9b1cf4cdee 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3790,6 +3790,13 @@ class VPlan { bool hasEarlyExit() const { return ExitBlocks.size() > 1 || ExitBlocks[0]->getNumPredecessors() > 1; } + + /// Returns true if the scalar tail may execute after the vector loop. Note + /// that this relies on unneeded branches to the scalar tail loop being + /// removed. + bool hasScalarTail() const { + return getScalarPreheader()->getNumPredecessors() != 0; + } }; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)