@@ -1554,9 +1554,32 @@ class LoopVectorizationCostModel {
15541554 // / trivially hoistable.
15551555 bool shouldConsiderInvariant (Value *Op);
15561556
1557+ /// Return the cached vscale estimate used for tuning the cost model. The result is empty if initializeVScaleForTuning() has not stored a value.
1558+ std::optional<unsigned > getVScaleForTuning () const { return VScaleForTuning; }
1559+
15571560private:
15581561 unsigned NumPredStores = 0 ;
15591562
1563+ std::optional<unsigned > VScaleForTuning; // Cached vscale estimate for cost-model tuning; assigned by initializeVScaleForTuning().
1564+
1565+ /// Computes and caches in VScaleForTuning the vscale value used for tuning the
1566+ /// cost model. If vscale_range.min == vscale_range.max, vscale_range.max is
1567+ /// stored; otherwise the value returned by TTI.getVScaleForTuning() is stored.
1568+ void initializeVScaleForTuning () {
1569+ const Function *Fn = TheLoop->getHeader ()->getParent (); // function that contains the loop header
1570+ if (Fn->hasFnAttribute (Attribute::VScaleRange)) {
1571+ auto Attr = Fn->getFnAttribute (Attribute::VScaleRange);
1572+ auto Min = Attr.getVScaleRangeMin ();
1573+ auto Max = Attr.getVScaleRangeMax ();
1574+ if (Max && Min == Max) { // taken only when Max tests true and the range collapses to a single value
1575+ VScaleForTuning = Max;
1576+ return ;
1577+ }
1578+ }
1579+
1580+ VScaleForTuning = TTI.getVScaleForTuning ();
1581+ }
1582+
15601583 // / \return An upper bound for the vectorization factors for both
15611584 // / fixed and scalable vectorization, where the minimum-known number of
15621585 // / elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -3838,6 +3861,11 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
38383861 if (!Legal->isSafeForAnyVectorWidth ())
38393862 this ->MaxSafeElements = MaxSafeElements;
38403863
3864+ if (MaxSafeScalableVF != ElementCount::getScalable (0 )) {
3865+ // Cache the value of vscale for tuning, since we'll need it.
3866+ initializeVScaleForTuning ();
3867+ }
3868+
38413869 LLVM_DEBUG (dbgs () << " LV: The max safe fixed VF is: " << MaxSafeFixedVF
38423870 << " .\n " );
38433871 LLVM_DEBUG (dbgs () << " LV: The max safe scalable VF is: " << MaxSafeScalableVF
@@ -4231,33 +4259,15 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
42314259 return MaxVF;
42324260}
42334261
4234- // / Convenience function that returns the value of vscale_range iff
4235- // / vscale_range.min == vscale_range.max or otherwise returns the value
4236- // / returned by the corresponding TTI method.
4237- static std::optional<unsigned >
4238- getVScaleForTuning (const Loop *L, const TargetTransformInfo &TTI) {
4239- const Function *Fn = L->getHeader ()->getParent ();
4240- if (Fn->hasFnAttribute (Attribute::VScaleRange)) {
4241- auto Attr = Fn->getFnAttribute (Attribute::VScaleRange);
4242- auto Min = Attr.getVScaleRangeMin ();
4243- auto Max = Attr.getVScaleRangeMax ();
4244- if (Max && Min == Max)
4245- return Max;
4246- }
4247-
4248- return TTI.getVScaleForTuning ();
4249- }
4250-
42514262// / This function attempts to return a value that represents the vectorization
42524263// / factor at runtime. For fixed-width VFs we know this precisely at compile
42534264// / time, but for scalable VFs we calculate it based on an estimate of the
42544265// / vscale value.
4255- static unsigned getEstimatedRuntimeVF (const Loop *L,
4256- const TargetTransformInfo &TTI,
4257- ElementCount VF) {
4266+ static unsigned getEstimatedRuntimeVF (ElementCount VF,
4267+ std::optional<unsigned > VScale) {
42584268 unsigned EstimatedVF = VF.getKnownMinValue ();
42594269 if (VF.isScalable ())
4260- if (std::optional< unsigned > VScale = getVScaleForTuning (L, TTI) )
4270+ if (VScale)
42614271 EstimatedVF *= *VScale;
42624272 assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
42634273 return EstimatedVF;
@@ -4272,7 +4282,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(
42724282 // Improve estimate for the vector width if it is scalable.
42734283 unsigned EstimatedWidthA = A.Width .getKnownMinValue ();
42744284 unsigned EstimatedWidthB = B.Width .getKnownMinValue ();
4275- if (std::optional<unsigned > VScale = getVScaleForTuning (OrigLoop, TTI )) {
4285+ if (std::optional<unsigned > VScale = CM. getVScaleForTuning ()) {
42764286 if (A.Width .isScalable ())
42774287 EstimatedWidthA *= *VScale;
42784288 if (B.Width .isScalable ())
@@ -4565,13 +4575,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
45654575 InstructionCost C = CM.expectedCost (VF);
45664576 VectorizationFactor Candidate (VF, C, ScalarCost.ScalarCost );
45674577
4568- unsigned Width = getEstimatedRuntimeVF (OrigLoop, TTI, Candidate.Width );
4578+ unsigned Width =
4579+ getEstimatedRuntimeVF (Candidate.Width , CM.getVScaleForTuning ());
45694580 LLVM_DEBUG (dbgs () << " LV: Vector loop of width " << VF
45704581 << " costs: " << (Candidate.Cost / Width));
45714582 if (VF.isScalable ())
45724583 LLVM_DEBUG (dbgs () << " (assuming a minimum vscale of "
4573- << getVScaleForTuning (OrigLoop, TTI).value_or (1 )
4574- << " )" );
4584+ << CM.getVScaleForTuning ().value_or (1 ) << " )" );
45754585 LLVM_DEBUG (dbgs () << " .\n " );
45764586
45774587 if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI)) {
@@ -4660,7 +4670,8 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
46604670 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences () > 0
46614671 ? EpilogueVectorizationMinVF
46624672 : TTI.getEpilogueVectorizationMinVF ();
4663- return getEstimatedRuntimeVF (TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4673+ return getEstimatedRuntimeVF (VF * Multiplier, VScaleForTuning) >=
4674+ MinVFThreshold;
46644675}
46654676
46664677VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor (
@@ -4712,8 +4723,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47124723 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
47134724 // the main loop handles 8 lanes per iteration. We could still benefit from
47144725 // vectorizing the epilogue loop with VF=4.
4715- ElementCount EstimatedRuntimeVF =
4716- ElementCount::getFixed ( getEstimatedRuntimeVF (OrigLoop, TTI, MainLoopVF ));
4726+ ElementCount EstimatedRuntimeVF = ElementCount::getFixed (
4727+ getEstimatedRuntimeVF (MainLoopVF, CM. getVScaleForTuning () ));
47174728
47184729 ScalarEvolution &SE = *PSE.getSE ();
47194730 Type *TCType = Legal->getWidestInductionType ();
@@ -4959,7 +4970,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49594970 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
49604971 }
49614972
4962- unsigned EstimatedVF = getEstimatedRuntimeVF (TheLoop, TTI, VF );
4973+ unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning );
49634974 unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
49644975 if (KnownTC > 0 ) {
49654976 // At least one iteration must be scalar when this constraint holds. So the
@@ -7388,7 +7399,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
73887399 // Now compute and add the VPlan-based cost.
73897400 Cost += Plan.cost (VF, CostCtx);
73907401#ifndef NDEBUG
7391- unsigned EstimatedWidth = getEstimatedRuntimeVF (OrigLoop , CM.TTI , VF );
7402+ unsigned EstimatedWidth = getEstimatedRuntimeVF (VF , CM.getVScaleForTuning () );
73927403 LLVM_DEBUG (dbgs () << " Cost for VF " << VF << " : " << Cost
73937404 << " (Estimated cost per lane: " );
73947405 if (Cost.isValid ()) {
@@ -10033,9 +10044,9 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
1003310044
1003410045static bool areRuntimeChecksProfitable (GeneratedRTChecks &Checks,
1003510046 VectorizationFactor &VF, Loop *L,
10036- const TargetTransformInfo &TTI,
1003710047 PredicatedScalarEvolution &PSE,
10038- ScalarEpilogueLowering SEL) {
10048+ ScalarEpilogueLowering SEL,
10049+ std::optional<unsigned > VScale) {
1003910050 InstructionCost CheckCost = Checks.getCost ();
1004010051 if (!CheckCost.isValid ())
1004110052 return false ;
@@ -10085,7 +10096,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
1008510096 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
1008610097 // the computations are performed on doubles, not integers and the result
1008710098 // is rounded up, hence we get an upper estimate of the TC.
10088- unsigned IntVF = getEstimatedRuntimeVF (L, TTI, VF.Width );
10099+ unsigned IntVF = getEstimatedRuntimeVF (VF.Width , VScale );
1008910100 uint64_t RtC = *CheckCost.getValue ();
1009010101 uint64_t Div = ScalarC * IntVF - *VF.Cost .getValue ();
1009110102 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil (RtC * IntVF, Div);
@@ -10522,7 +10533,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1052210533 bool ForceVectorization =
1052310534 Hints.getForce () == LoopVectorizeHints::FK_Enabled;
1052410535 if (!ForceVectorization &&
10525- !areRuntimeChecksProfitable (Checks, VF, L, *TTI, PSE, SEL)) {
10536+ !areRuntimeChecksProfitable (Checks, VF, L, PSE, SEL,
10537+ CM.getVScaleForTuning ())) {
1052610538 ORE->emit ([&]() {
1052710539 return OptimizationRemarkAnalysisAliasing (
1052810540 DEBUG_TYPE, " CantReorderMemOps" , L->getStartLoc (),
0 commit comments