@@ -989,7 +989,10 @@ class LoopVectorizationCostModel {
989989 InterleavedAccessInfo &IAI)
990990 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
991991 TTI (TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
992- Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
992+ Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {
993+ if (TTI.supportsScalableVectors () || ForceTargetSupportsScalableVectors)
994+ initializeVScaleForTuning ();
995+ }
993996
994997 // / \return An upper bound for the vectorization factors (both fixed and
995998 // / scalable). If the factors are 0, vectorization and interleaving should be
@@ -1565,9 +1568,34 @@ class LoopVectorizationCostModel {
15651568 // / trivially hoistable.
15661569 bool shouldConsiderInvariant (Value *Op);
15671570
1571+ // / \return the value of vscale used for tuning the cost model; empty if unset.
1572+ std::optional<unsigned > getVScaleForTuning () const { return VScaleForTuning; }
1573+
15681574private:
15691575 unsigned NumPredStores = 0 ;
15701576
1577+ // / The value of vscale used for tuning the cost model. It is set during object
1578+ // / construction only when scalable vectors are supported or forced, else empty.
1579+ std::optional<unsigned > VScaleForTuning;
1580+
1581+ // / Initializes the value of vscale used for tuning the cost model. If
1582+ // / vscale_range.min == vscale_range.max then store vscale_range.max, else
1583+ // / store the value returned by the corresponding TTI method.
1584+ void initializeVScaleForTuning () {
1585+ const Function *Fn = TheLoop->getHeader ()->getParent ();
1586+ if (Fn->hasFnAttribute (Attribute::VScaleRange)) {
1587+ auto Attr = Fn->getFnAttribute (Attribute::VScaleRange);
1588+ auto Min = Attr.getVScaleRangeMin ();
1589+ auto Max = Attr.getVScaleRangeMax ();
1590+ if (Max && Min == Max) {
1591+ VScaleForTuning = Max;
1592+ return ;
1593+ }
1594+ }
1595+
1596+ VScaleForTuning = TTI.getVScaleForTuning ();
1597+ }
1598+
15711599 // / \return An upper bound for the vectorization factors for both
15721600 // / fixed and scalable vectorization, where the minimum-known number of
15731601 // / elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -4242,33 +4270,15 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
42424270 return MaxVF;
42434271}
42444272
4245- // / Convenience function that returns the value of vscale_range iff
4246- // / vscale_range.min == vscale_range.max or otherwise returns the value
4247- // / returned by the corresponding TTI method.
4248- static std::optional<unsigned >
4249- getVScaleForTuning (const Loop *L, const TargetTransformInfo &TTI) {
4250- const Function *Fn = L->getHeader ()->getParent ();
4251- if (Fn->hasFnAttribute (Attribute::VScaleRange)) {
4252- auto Attr = Fn->getFnAttribute (Attribute::VScaleRange);
4253- auto Min = Attr.getVScaleRangeMin ();
4254- auto Max = Attr.getVScaleRangeMax ();
4255- if (Max && Min == Max)
4256- return Max;
4257- }
4258-
4259- return TTI.getVScaleForTuning ();
4260- }
4261-
42624273// / This function attempts to return a value that represents the vectorization
42634274// / factor at runtime. For fixed-width VFs we know this precisely at compile
42644275// / time, but for scalable VFs we calculate it based on an estimate of the
42654276// / vscale value.
4266- static unsigned getEstimatedRuntimeVF (const Loop *L,
4267- const TargetTransformInfo &TTI,
4268- ElementCount VF) {
4277+ static unsigned getEstimatedRuntimeVF (ElementCount VF,
4278+ std::optional<unsigned > VScale) {
42694279 unsigned EstimatedVF = VF.getKnownMinValue ();
42704280 if (VF.isScalable ())
4271- if (std::optional< unsigned > VScale = getVScaleForTuning (L, TTI) )
4281+ if (VScale)
42724282 EstimatedVF *= *VScale;
42734283 assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
42744284 return EstimatedVF;
@@ -4283,7 +4293,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(
42834293 // Improve estimate for the vector width if it is scalable.
42844294 unsigned EstimatedWidthA = A.Width .getKnownMinValue ();
42854295 unsigned EstimatedWidthB = B.Width .getKnownMinValue ();
4286- if (std::optional<unsigned > VScale = getVScaleForTuning (OrigLoop, TTI )) {
4296+ if (std::optional<unsigned > VScale = CM. getVScaleForTuning ()) {
42874297 if (A.Width .isScalable ())
42884298 EstimatedWidthA *= *VScale;
42894299 if (B.Width .isScalable ())
@@ -4576,13 +4586,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
45764586 InstructionCost C = CM.expectedCost (VF);
45774587 VectorizationFactor Candidate (VF, C, ScalarCost.ScalarCost );
45784588
4579- unsigned Width = getEstimatedRuntimeVF (OrigLoop, TTI, Candidate.Width );
4589+ unsigned Width =
4590+ getEstimatedRuntimeVF (Candidate.Width , CM.getVScaleForTuning ());
45804591 LLVM_DEBUG (dbgs () << " LV: Vector loop of width " << VF
45814592 << " costs: " << (Candidate.Cost / Width));
45824593 if (VF.isScalable ())
45834594 LLVM_DEBUG (dbgs () << " (assuming a minimum vscale of "
4584- << getVScaleForTuning (OrigLoop, TTI).value_or (1 )
4585- << " )" );
4595+ << CM.getVScaleForTuning ().value_or (1 ) << " )" );
45864596 LLVM_DEBUG (dbgs () << " .\n " );
45874597
45884598 if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI)) {
@@ -4671,7 +4681,8 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
46714681 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences () > 0
46724682 ? EpilogueVectorizationMinVF
46734683 : TTI.getEpilogueVectorizationMinVF ();
4674- return getEstimatedRuntimeVF (TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4684+ return getEstimatedRuntimeVF (VF * Multiplier, VScaleForTuning) >=
4685+ MinVFThreshold;
46754686}
46764687
46774688VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor (
@@ -4723,8 +4734,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47234734 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
47244735 // the main loop handles 8 lanes per iteration. We could still benefit from
47254736 // vectorizing the epilogue loop with VF=4.
4726- ElementCount EstimatedRuntimeVF =
4727- ElementCount::getFixed ( getEstimatedRuntimeVF (OrigLoop, TTI, MainLoopVF ));
4737+ ElementCount EstimatedRuntimeVF = ElementCount::getFixed (
4738+ getEstimatedRuntimeVF (MainLoopVF, CM. getVScaleForTuning () ));
47284739
47294740 ScalarEvolution &SE = *PSE.getSE ();
47304741 Type *TCType = Legal->getWidestInductionType ();
@@ -4970,7 +4981,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49704981 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
49714982 }
49724983
4973- unsigned EstimatedVF = getEstimatedRuntimeVF (TheLoop, TTI, VF );
4984+ unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning );
49744985 unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
49754986 if (KnownTC > 0 ) {
49764987 // At least one iteration must be scalar when this constraint holds. So the
@@ -7399,7 +7410,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
73997410 // Now compute and add the VPlan-based cost.
74007411 Cost += Plan.cost (VF, CostCtx);
74017412#ifndef NDEBUG
7402- unsigned EstimatedWidth = getEstimatedRuntimeVF (OrigLoop , CM.TTI , VF );
7413+ unsigned EstimatedWidth = getEstimatedRuntimeVF (VF , CM.getVScaleForTuning () );
74037414 LLVM_DEBUG (dbgs () << " Cost for VF " << VF << " : " << Cost
74047415 << " (Estimated cost per lane: " );
74057416 if (Cost.isValid ()) {
@@ -10063,9 +10074,9 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
1006310074
1006410075static bool areRuntimeChecksProfitable (GeneratedRTChecks &Checks,
1006510076 VectorizationFactor &VF, Loop *L,
10066- const TargetTransformInfo &TTI,
1006710077 PredicatedScalarEvolution &PSE,
10068- ScalarEpilogueLowering SEL) {
10078+ ScalarEpilogueLowering SEL,
10079+ std::optional<unsigned > VScale) {
1006910080 InstructionCost CheckCost = Checks.getCost ();
1007010081 if (!CheckCost.isValid ())
1007110082 return false ;
@@ -10115,7 +10126,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
1011510126 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
1011610127 // the computations are performed on doubles, not integers and the result
1011710128 // is rounded up, hence we get an upper estimate of the TC.
10118- unsigned IntVF = getEstimatedRuntimeVF (L, TTI, VF.Width );
10129+ unsigned IntVF = getEstimatedRuntimeVF (VF.Width , VScale );
1011910130 uint64_t RtC = *CheckCost.getValue ();
1012010131 uint64_t Div = ScalarC * IntVF - *VF.Cost .getValue ();
1012110132 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil (RtC * IntVF, Div);
@@ -10552,7 +10563,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1055210563 bool ForceVectorization =
1055310564 Hints.getForce () == LoopVectorizeHints::FK_Enabled;
1055410565 if (!ForceVectorization &&
10555- !areRuntimeChecksProfitable (Checks, VF, L, *TTI, PSE, SEL)) {
10566+ !areRuntimeChecksProfitable (Checks, VF, L, PSE, SEL,
10567+ CM.getVScaleForTuning ())) {
1055610568 ORE->emit ([&]() {
1055710569 return OptimizationRemarkAnalysisAliasing (
1055810570 DEBUG_TYPE, " CantReorderMemOps" , L->getStartLoc (),
0 commit comments