@@ -1520,7 +1520,7 @@ class LoopVectorizationCostModel {
15201520 // / \p Multiplier is an additional scaling factor applied to VF before
15211521 // / comparing to EpilogueVectorizationMinVF.
15221522 bool isEpilogueVectorizationProfitable (const ElementCount VF,
1523- const unsigned Multiplier ) const ;
1523+ const unsigned IC ) const ;
15241524
15251525 // / Returns the execution time cost of an instruction for a given vector
15261526 // / width. Vector width of one means scalar.
@@ -4292,6 +4292,21 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
42924292 return TTI.getVScaleForTuning ();
42934293}
42944294
4295+ // / Returns an estimate of the number of lanes \p VF covers at runtime.
4296+ // / For fixed-width VFs this is exactly the known minimum value. For scalable
4297+ // / VFs the known minimum is multiplied by the vscale tuning estimate from
4298+ // / getVScaleForTuning(L, TTI); with no estimate it is returned unscaled.
4299+ static unsigned getEstimatedRuntimeVF (const Loop *L,
4300+ const TargetTransformInfo &TTI,
4301+ ElementCount VF) {
4302+ unsigned EstimatedVF = VF.getKnownMinValue (); // Starting point for both fixed and scalable VFs.
4303+ if (VF.isScalable ())
4304+ if (std::optional<unsigned > VScale = getVScaleForTuning (L, TTI)) // Scale only when a tuning value exists.
4305+ EstimatedVF *= *VScale;
4306+ assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
4307+ return EstimatedVF;
4308+ }
4309+
42954310bool LoopVectorizationPlanner::isMoreProfitable (
42964311 const VectorizationFactor &A, const VectorizationFactor &B,
42974312 const unsigned MaxTripCount) const {
@@ -4594,17 +4609,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
45944609 InstructionCost C = CM.expectedCost (VF);
45954610 VectorizationFactor Candidate (VF, C, ScalarCost.ScalarCost );
45964611
4597- unsigned AssumedMinimumVscale =
4598- getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4599- unsigned Width =
4600- Candidate.Width .isScalable ()
4601- ? Candidate.Width .getKnownMinValue () * AssumedMinimumVscale
4602- : Candidate.Width .getFixedValue ();
4612+ unsigned Width = getEstimatedRuntimeVF (OrigLoop, TTI, Candidate.Width );
46034613 LLVM_DEBUG (dbgs () << " LV: Vector loop of width " << VF
46044614 << " costs: " << (Candidate.Cost / Width));
46054615 if (VF.isScalable ())
46064616 LLVM_DEBUG (dbgs () << " (assuming a minimum vscale of "
4607- << AssumedMinimumVscale << " )" );
4617+ << getVScaleForTuning (OrigLoop, TTI).value_or (1 )
4618+ << " )" );
46084619 LLVM_DEBUG (dbgs () << " .\n " );
46094620
46104621 if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI)) {
@@ -4670,7 +4681,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
46704681}
46714682
46724683bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable (
4673- const ElementCount VF, const unsigned Multiplier ) const {
4684+ const ElementCount VF, const unsigned IC ) const {
46744685 // FIXME: We need a much better cost-model to take different parameters such
46754686 // as register pressure, code size increase and cost of extra branches into
46764687 // account. For now we apply a very crude heuristic and only consider loops
@@ -4685,9 +4696,13 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
46854696 if (TTI.getMaxInterleaveFactor (VF) <= 1 )
46864697 return false ;
46874698
4688- if ((Multiplier * VF.getKnownMinValue ()) >= EpilogueVectorizationMinVF)
4689- return true ;
4690- return false ;
4699+ // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4700+ // VFs when deciding profitability.
4701+ // See related "TODO: extend to support scalable VFs." in
4702+ // selectEpilogueVectorizationFactor.
4703+ unsigned Multiplier = VF.isFixed () ? IC : 1 ;
4704+ return getEstimatedRuntimeVF (TheLoop, TTI, VF * Multiplier) >=
4705+ EpilogueVectorizationMinVF;
46914706}
46924707
46934708VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor (
@@ -4730,11 +4745,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47304745 return Result;
47314746 }
47324747
4733- unsigned Multiplier = IC;
4734- if (MainLoopVF.isScalable ())
4735- Multiplier = getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4736-
4737- if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, Multiplier)) {
4748+ if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, IC)) {
47384749 LLVM_DEBUG (dbgs () << " LEV: Epilogue vectorization is not profitable for "
47394750 " this loop\n " );
47404751 return Result;
@@ -4743,12 +4754,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47434754 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
47444755 // the main loop handles 8 lanes per iteration. We could still benefit from
47454756 // vectorizing the epilogue loop with VF=4.
4746- ElementCount EstimatedRuntimeVF = MainLoopVF;
4747- if (MainLoopVF.isScalable ()) {
4748- EstimatedRuntimeVF = ElementCount::getFixed (MainLoopVF.getKnownMinValue ());
4749- if (std::optional<unsigned > VScale = getVScaleForTuning (OrigLoop, TTI))
4750- EstimatedRuntimeVF *= *VScale;
4751- }
4757+ ElementCount EstimatedRuntimeVF =
4758+ ElementCount::getFixed (getEstimatedRuntimeVF (OrigLoop, TTI, MainLoopVF));
47524759
47534760 ScalarEvolution &SE = *PSE.getSE ();
47544761 Type *TCType = Legal->getWidestInductionType ();
@@ -4988,13 +4995,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49884995 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
49894996 }
49904997
4991- unsigned EstimatedVF = VF.getKnownMinValue ();
4992- if (VF.isScalable ()) {
4993- if (std::optional<unsigned > VScale = getVScaleForTuning (TheLoop, TTI))
4994- EstimatedVF *= *VScale;
4995- }
4996- assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
4997-
4998+ unsigned EstimatedVF = getEstimatedRuntimeVF (TheLoop, TTI, VF);
49984999 unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
49995000 if (KnownTC > 0 ) {
50005001 // At least one iteration must be scalar when this constraint holds. So the
@@ -7426,10 +7427,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
74267427 // Now compute and add the VPlan-based cost.
74277428 Cost += Plan.cost (VF, CostCtx);
74287429#ifndef NDEBUG
7429- unsigned EstimatedWidth = VF.getKnownMinValue ();
7430- if (VF.isScalable ())
7431- if (std::optional<unsigned > VScale = getVScaleForTuning (OrigLoop, TTI))
7432- EstimatedWidth *= *VScale;
7430+ unsigned EstimatedWidth = getEstimatedRuntimeVF (OrigLoop, CM.TTI , VF);
74337431 LLVM_DEBUG (dbgs () << " Cost for VF " << VF << " : " << Cost
74347432 << " (Estimated cost per lane: " );
74357433 if (Cost.isValid ()) {
@@ -9811,8 +9809,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
98119809}
98129810
98139811static bool areRuntimeChecksProfitable (GeneratedRTChecks &Checks,
9814- VectorizationFactor &VF,
9815- std::optional< unsigned > VScale, Loop *L ,
9812+ VectorizationFactor &VF, Loop *L,
9813+ const TargetTransformInfo &TTI ,
98169814 PredicatedScalarEvolution &PSE,
98179815 ScalarEpilogueLowering SEL) {
98189816 InstructionCost CheckCost = Checks.getCost ();
@@ -9864,13 +9862,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
98649862 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
98659863 // the computations are performed on doubles, not integers and the result
98669864 // is rounded up, hence we get an upper estimate of the TC.
9867- unsigned IntVF = VF.Width .getKnownMinValue ();
9868- if (VF.Width .isScalable ()) {
9869- unsigned AssumedMinimumVscale = 1 ;
9870- if (VScale)
9871- AssumedMinimumVscale = *VScale;
9872- IntVF *= AssumedMinimumVscale;
9873- }
9865+ unsigned IntVF = getEstimatedRuntimeVF (L, TTI, VF.Width );
98749866 uint64_t RtC = *CheckCost.getValue ();
98759867 uint64_t Div = ScalarC * IntVF - *VF.Cost .getValue ();
98769868 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil (RtC * IntVF, Div);
@@ -10119,8 +10111,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1011910111 bool ForceVectorization =
1012010112 Hints.getForce () == LoopVectorizeHints::FK_Enabled;
1012110113 if (!ForceVectorization &&
10122- !areRuntimeChecksProfitable (Checks, VF, getVScaleForTuning (L, *TTI), L,
10123- PSE, SEL)) {
10114+ !areRuntimeChecksProfitable (Checks, VF, L, *TTI, PSE, SEL)) {
1012410115 ORE->emit ([&]() {
1012510116 return OptimizationRemarkAnalysisAliasing (
1012610117 DEBUG_TYPE, " CantReorderMemOps" , L->getStartLoc (),
0 commit comments