@@ -1519,7 +1519,7 @@ class LoopVectorizationCostModel {
15191519 // / \p Multiplier is an additional scaling factor applied to VF before
15201520 // / comparing to EpilogueVectorizationMinVF.
15211521 bool isEpilogueVectorizationProfitable (const ElementCount VF,
1522- const unsigned Multiplier ) const ;
1522+ const unsigned IC ) const ;
15231523
15241524 // / Returns the execution time cost of an instruction for a given vector
15251525 // / width. Vector width of one means scalar.
@@ -4291,6 +4291,21 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
42914291 return TTI.getVScaleForTuning ();
42924292}
42934293
4294+ // / This function attempts to return a value that represents the vectorization
4295+ // / factor at runtime. For fixed-width VFs we know this precisely at compile
4296+ // / time, but for scalable VFs we calculate it based on an estimate of the
4297+ // / vscale value; without such an estimate the known minimum VF is returned.
4298+ static unsigned getEstimatedRuntimeVF (const Loop *L,
4299+ const TargetTransformInfo &TTI,
4300+ ElementCount VF) {
4301+ unsigned EstimatedVF = VF.getKnownMinValue (); // Used as-is for fixed-width VFs.
4302+ if (VF.isScalable ())
4303+ if (std::optional<unsigned > VScale = getVScaleForTuning (L, TTI))
4304+ EstimatedVF *= *VScale; // Only scaled when a vscale estimate exists.
4305+ assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
4306+ return EstimatedVF;
4307+ }
4308+
42944309bool LoopVectorizationPlanner::isMoreProfitable (
42954310 const VectorizationFactor &A, const VectorizationFactor &B,
42964311 const unsigned MaxTripCount) const {
@@ -4593,17 +4608,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
45934608 InstructionCost C = CM.expectedCost (VF);
45944609 VectorizationFactor Candidate (VF, C, ScalarCost.ScalarCost );
45954610
4596- unsigned AssumedMinimumVscale =
4597- getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4598- unsigned Width =
4599- Candidate.Width .isScalable ()
4600- ? Candidate.Width .getKnownMinValue () * AssumedMinimumVscale
4601- : Candidate.Width .getFixedValue ();
4611+ unsigned Width = getEstimatedRuntimeVF (OrigLoop, TTI, Candidate.Width );
46024612 LLVM_DEBUG (dbgs () << " LV: Vector loop of width " << VF
46034613 << " costs: " << (Candidate.Cost / Width));
46044614 if (VF.isScalable ())
46054615 LLVM_DEBUG (dbgs () << " (assuming a minimum vscale of "
4606- << AssumedMinimumVscale << " )" );
4616+ << getVScaleForTuning (OrigLoop, TTI).value_or (1 )
4617+ << " )" );
46074618 LLVM_DEBUG (dbgs () << " .\n " );
46084619
46094620 if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI)) {
@@ -4669,7 +4680,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
46694680}
46704681
46714682bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable (
4672- const ElementCount VF, const unsigned Multiplier ) const {
4683+ const ElementCount VF, const unsigned IC ) const {
46734684 // FIXME: We need a much better cost-model to take different parameters such
46744685 // as register pressure, code size increase and cost of extra branches into
46754686 // account. For now we apply a very crude heuristic and only consider loops
@@ -4684,9 +4695,13 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
46844695 if (TTI.getMaxInterleaveFactor (VF) <= 1 )
46854696 return false ;
46864697
4687- if ((Multiplier * VF.getKnownMinValue ()) >= EpilogueVectorizationMinVF)
4688- return true ;
4689- return false ;
4698+ // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4699+ // VFs when deciding profitability.
4700+ // See related "TODO: extend to support scalable VFs." in
4701+ // selectEpilogueVectorizationFactor.
4702+ unsigned Multiplier = VF.isFixed () ? IC : 1 ;
4703+ return getEstimatedRuntimeVF (TheLoop, TTI, VF * Multiplier) >=
4704+ EpilogueVectorizationMinVF;
46904705}
46914706
46924707VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor (
@@ -4729,11 +4744,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47294744 return Result;
47304745 }
47314746
4732- unsigned Multiplier = IC;
4733- if (MainLoopVF.isScalable ())
4734- Multiplier = getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4735-
4736- if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, Multiplier)) {
4747+ if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, IC)) {
47374748 LLVM_DEBUG (dbgs () << " LEV: Epilogue vectorization is not profitable for "
47384749 " this loop\n " );
47394750 return Result;
@@ -4742,12 +4753,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47424753 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
47434754 // the main loop handles 8 lanes per iteration. We could still benefit from
47444755 // vectorizing the epilogue loop with VF=4.
4745- ElementCount EstimatedRuntimeVF = MainLoopVF;
4746- if (MainLoopVF.isScalable ()) {
4747- EstimatedRuntimeVF = ElementCount::getFixed (MainLoopVF.getKnownMinValue ());
4748- if (std::optional<unsigned > VScale = getVScaleForTuning (OrigLoop, TTI))
4749- EstimatedRuntimeVF *= *VScale;
4750- }
4756+ ElementCount EstimatedRuntimeVF =
4757+ ElementCount::getFixed (getEstimatedRuntimeVF (OrigLoop, TTI, MainLoopVF));
47514758
47524759 ScalarEvolution &SE = *PSE.getSE ();
47534760 Type *TCType = Legal->getWidestInductionType ();
@@ -4987,13 +4994,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49874994 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
49884995 }
49894996
4990- unsigned EstimatedVF = VF.getKnownMinValue ();
4991- if (VF.isScalable ()) {
4992- if (std::optional<unsigned > VScale = getVScaleForTuning (TheLoop, TTI))
4993- EstimatedVF *= *VScale;
4994- }
4995- assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
4996-
4997+ unsigned EstimatedVF = getEstimatedRuntimeVF (TheLoop, TTI, VF);
49974998 unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
49984999 if (KnownTC > 0 ) {
49995000 // At least one iteration must be scalar when this constraint holds. So the
@@ -9797,8 +9798,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
97979798}
97989799
97999800static bool areRuntimeChecksProfitable (GeneratedRTChecks &Checks,
9800- VectorizationFactor &VF,
9801- std::optional< unsigned > VScale, Loop *L ,
9801+ VectorizationFactor &VF, Loop *L,
9802+ const TargetTransformInfo &TTI ,
98029803 PredicatedScalarEvolution &PSE,
98039804 ScalarEpilogueLowering SEL) {
98049805 InstructionCost CheckCost = Checks.getCost ();
@@ -9850,13 +9851,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
98509851 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
98519852 // the computations are performed on doubles, not integers and the result
98529853 // is rounded up, hence we get an upper estimate of the TC.
9853- unsigned IntVF = VF.Width .getKnownMinValue ();
9854- if (VF.Width .isScalable ()) {
9855- unsigned AssumedMinimumVscale = 1 ;
9856- if (VScale)
9857- AssumedMinimumVscale = *VScale;
9858- IntVF *= AssumedMinimumVscale;
9859- }
9854+ unsigned IntVF = getEstimatedRuntimeVF (L, TTI, VF.Width );
98609855 uint64_t RtC = *CheckCost.getValue ();
98619856 uint64_t Div = ScalarC * IntVF - *VF.Cost .getValue ();
98629857 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil (RtC * IntVF, Div);
@@ -10105,8 +10100,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1010510100 bool ForceVectorization =
1010610101 Hints.getForce () == LoopVectorizeHints::FK_Enabled;
1010710102 if (!ForceVectorization &&
10108- !areRuntimeChecksProfitable (Checks, VF, getVScaleForTuning (L, *TTI), L,
10109- PSE, SEL)) {
10103+ !areRuntimeChecksProfitable (Checks, VF, L, *TTI, PSE, SEL)) {
1011010104 ORE->emit ([&]() {
1011110105 return OptimizationRemarkAnalysisAliasing (
1011210106 DEBUG_TYPE, " CantReorderMemOps" , L->getStartLoc (),
0 commit comments