7 changes: 7 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1231,6 +1231,13 @@ class TargetTransformInfo {
LLVM_ABI bool
shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;

/// \return True if vectorization factors wider than those matching the
/// largest element type should be chosen conservatively. This only makes
/// sense when shouldMaximizeVectorBandwidth returns true.
/// \p K Register Kind for vectorization.
LLVM_ABI bool shouldMaximizeVectorBandwidthConservatively(
TargetTransformInfo::RegisterKind K) const;

/// \return The minimum vectorization factor for types of given element
/// bit width, or 0 if there is no minimum VF. The returned value only
/// applies when shouldMaximizeVectorBandwidth returns true.
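
For context, a backend opts in by overriding the new shouldMaximizeVectorBandwidthConservatively hook in its TTI implementation, alongside its existing shouldMaximizeVectorBandwidth override. A minimal sketch; the target class name and the policy it returns are illustrative assumptions, not part of this patch:

// Hypothetical override in a target's TTI implementation (MyTargetTTIImpl is
// a placeholder); the policy shown is an example only.
bool MyTargetTTIImpl::shouldMaximizeVectorBandwidthConservatively(
    TargetTransformInfo::RegisterKind K) const {
  // Apply the conservative comparison only to scalable vectors; fixed-width
  // VFs keep the existing selection behaviour.
  return K == TargetTransformInfo::RGK_ScalableVector;
}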
5 changes: 5 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -597,6 +597,11 @@ class TargetTransformInfoImplBase {
return false;
}

virtual bool shouldMaximizeVectorBandwidthConservatively(
TargetTransformInfo::RegisterKind K) const {
return false;
}

virtual ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const {
return ElementCount::get(0, IsScalable);
}
5 changes: 5 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -803,6 +803,11 @@ bool TargetTransformInfo::shouldMaximizeVectorBandwidth(
return TTIImpl->shouldMaximizeVectorBandwidth(K);
}

bool TargetTransformInfo::shouldMaximizeVectorBandwidthConservatively(
TargetTransformInfo::RegisterKind K) const {
return TTIImpl->shouldMaximizeVectorBandwidthConservatively(K);
}

ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth,
bool IsScalable) const {
return TTIImpl->getMinimumVF(ElemWidth, IsScalable);
7 changes: 6 additions & 1 deletion llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -76,6 +76,9 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
"dmb-lookahead-threshold", cl::init(10), cl::Hidden,
cl::desc("The number of instructions to search for a redundant dmb"));

static cl::opt<bool> EnableSVEMaximizeVecBW("enable-sve-maximize-vec-bw",
                                            cl::init(false), cl::Hidden);

Review comment (Contributor):
I'm not sure why we need a new flag to force enabling the max bandwidth, since one already exists in LoopVectorize.cpp - vectorizer-maximize-bandwidth.

Reply (Contributor Author):
Thank you. I did not intend to include this part in this patch. Removed.

namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
@@ -370,7 +373,9 @@ bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
TargetTransformInfo::RegisterKind K) const {
assert(K != TargetTransformInfo::RGK_Scalar);
return (K == TargetTransformInfo::RGK_FixedWidthVector &&
ST->isNeonAvailable());
ST->isNeonAvailable()) ||
(EnableSVEMaximizeVecBW &&
K == TargetTransformInfo::RGK_ScalableVector && ST->isSVEAvailable());
}

/// Calculate the cost of materializing a 64-bit value. This helper
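
As the review thread above notes, max-bandwidth selection can already be requested from the command line without a target-specific flag; the new test at the end of this change does exactly that. A sample invocation, with the flags taken from that test's RUN lines (input.ll is a placeholder file name):

opt < input.ll -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize \
    -vectorizer-maximize-bandwidth -vectorizer-maximize-bandwidth-conservatively -S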
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -474,7 +474,8 @@ class LoopVectorizationPlanner {
///
/// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
/// been retired.
InstructionCost cost(VPlan &Plan, ElementCount VF) const;
InstructionCost cost(VPlan &Plan, ElementCount VF,
bool CountsVecCalcOnly = false) const;

/// Precompute costs for certain instructions using the legacy cost model. The
/// function is used to bring up the VPlan-based cost model to initially avoid
77 changes: 63 additions & 14 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -263,6 +263,11 @@ static cl::opt<bool> MaximizeBandwidth(
cl::desc("Maximize bandwidth when selecting vectorization factor which "
"will be determined by the smallest type in loop."));

static cl::opt<bool> MaximizeBandwidthConservatively(
"vectorizer-maximize-bandwidth-conservatively", cl::init(false), cl::Hidden,
cl::desc("When MaximizeBandwidth is enabled, a larger vector factor is "
"chosen conservatively."));

static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
@@ -962,9 +967,16 @@ class LoopVectorizationCostModel {
/// user options, for the given register kind.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);

/// \return True if maximizing vector bandwidth should be applied
/// conservatively by the target or user options, for the given register kind.
/// This only makes sense when useMaxBandwidth returns true.
bool useMaxBandwidthConservatively(TargetTransformInfo::RegisterKind RegKind);

/// \return True if register pressure should be calculated for the given VF.
bool shouldCalculateRegPressureForVF(ElementCount VF);

/// \return True if \p VF is only permitted because maximizing vector
/// bandwidth is enabled, i.e. it is wider than the widest VF considered
/// without MaxBandwidth.
bool isVFForMaxBandwidth(ElementCount VF);

/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
/// 64 bit loop indices.
@@ -3812,11 +3824,15 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {

bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF(
ElementCount VF) {
// Only calculate register pressure for VFs enabled by MaxBandwidth.
return isVFForMaxBandwidth(VF);
}

bool LoopVectorizationCostModel::isVFForMaxBandwidth(ElementCount VF) {
if (!useMaxBandwidth(VF.isScalable()
? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector))
return false;
// Only calculate register pressure for VFs enabled by MaxBandwidth.
return ElementCount::isKnownGT(
VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
: MaxPermissibleVFWithoutMaxBW.FixedVF);
@@ -3830,6 +3846,13 @@ bool LoopVectorizationCostModel::useMaxBandwidth(
Legal->hasVectorCallVariants())));
}

bool LoopVectorizationCostModel::useMaxBandwidthConservatively(
TargetTransformInfo::RegisterKind RegKind) {
return MaximizeBandwidthConservatively ||
(MaximizeBandwidthConservatively.getNumOccurrences() == 0 &&
TTI.shouldMaximizeVectorBandwidthConservatively(RegKind));
}

ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
unsigned EstimatedVF = VF.getKnownMinValue();
@@ -6923,13 +6946,16 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
return Cost;
}

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
bool CountsVecCalcOnly) const {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
InstructionCost Cost;

if (!CountsVecCalcOnly)
Cost += precomputeCosts(Plan, VF, CostCtx);

// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
Cost += Plan.cost(VF, CostCtx, CountsVecCalcOnly);
#ifndef NDEBUG
unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
@@ -7105,8 +7131,25 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
continue;
}

if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
BestFactor = CurrentFactor;
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) {
if (CM.isVFForMaxBandwidth(VF) &&
CM.useMaxBandwidthConservatively(
VF.isScalable() ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector)) {
if (ElementCount::isKnownLT(BestFactor.Width, VF) &&
    llvm::is_contained(VFs, BestFactor.Width)) {
VectorizationFactor BestFactorVecCalc(
BestFactor.Width, cost(*P, BestFactor.Width, true), ScalarCost);
VectorizationFactor CurrentFactorVecCalc(VF, cost(*P, VF, true),
ScalarCost);
if (isMoreProfitable(CurrentFactorVecCalc, BestFactorVecCalc,
P->hasScalarTail()))
BestFactor = CurrentFactor;
}
} else {
BestFactor = CurrentFactor;
}
}

// If profitable add it to ProfitableVF list.
if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
@@ -7131,13 +7174,19 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
// legacy cost model doesn't properly model costs for such loops.
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
CostCtx, OrigLoop,
BestFactor.Width) ||
planContainsAdditionalSimplifications(
getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
" VPlan cost model and legacy cost model disagreed");
if (!CM.isVFForMaxBandwidth(LegacyVF.Width) ||
!CM.useMaxBandwidthConservatively(
LegacyVF.Width.isScalable()
? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector))
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
CostCtx, OrigLoop,
BestFactor.Width) ||
planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
CostCtx, OrigLoop,
LegacyVF.Width)) &&
" VPlan cost model and legacy cost model disagreed");
assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
"when vectorizing, the scalar cost must be computed.");
#endif
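
To make the conservative selection in computeBestVF concrete, here is how it plays out on the new AArch64/SVE test at the end of this change, using the per-lane figures printed in that test's CHECK lines:

Full cost model:               vscale x 2 = 4.0/lane, vscale x 4 = 3.5/lane, vscale x 8 = 3.2/lane  (vscale x 8 preferred)
Vector-calculation cost only:  vscale x 2 = 1.5/lane, vscale x 4 = 1.8/lane, vscale x 8 = 1.9/lane  (vscale x 2 preferred)

Because the wider max-bandwidth candidates do not also win the vector-calculation-only comparison, BestFactor keeps vscale x 2 when the conservative mode is in effect.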
54 changes: 41 additions & 13 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -800,10 +800,34 @@ void VPRegionBlock::execute(VPTransformState *State) {
State->Lane.reset();
}

InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) {
InstructionCost Cost = 0;
for (VPRecipeBase &R : Recipes)
Cost += R.cost(VF, Ctx);
for (VPRecipeBase &R : Recipes) {
if (!CountsVecCalcOnly)
Cost += R.cost(VF, Ctx);
else {
switch (R.getVPDefID()) {
case VPDef::VPActiveLaneMaskPHISC:
case VPDef::VPBlendSC:
case VPDef::VPFirstOrderRecurrencePHISC:
case VPDef::VPPartialReductionSC:
case VPDef::VPReductionPHISC:
case VPDef::VPReductionSC:
case VPDef::VPWidenCallSC:
case VPDef::VPWidenCanonicalIVSC:
case VPDef::VPWidenCastSC:
case VPDef::VPWidenGEPSC:
case VPDef::VPWidenIntOrFpInductionSC:
case VPDef::VPWidenIntrinsicSC:
case VPDef::VPWidenPHISC:
case VPDef::VPWidenPointerInductionSC:
case VPDef::VPWidenSC:
case VPDef::VPWidenSelectSC:
Cost += R.cost(VF, Ctx);
}
}
}
return Cost;
}

@@ -826,11 +850,12 @@ const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
return Pred->getExitingBasicBlock();
}

InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) {
if (!isReplicator()) {
InstructionCost Cost = 0;
for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
Cost += Block->cost(VF, Ctx);
Cost += Block->cost(VF, Ctx, CountsVecCalcOnly);
InstructionCost BackedgeCost =
ForceTargetInstructionCost.getNumOccurrences()
? InstructionCost(ForceTargetInstructionCost.getNumOccurrences())
@@ -853,7 +878,7 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
// uniform condition.
using namespace llvm::VPlanPatternMatch;
VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
InstructionCost ThenCost = Then->cost(VF, Ctx);
InstructionCost ThenCost = Then->cost(VF, Ctx, CountsVecCalcOnly);

// For the scalar case, we may not always execute the original predicated
// block, Thus, scale the block's cost by the probability of executing it.
@@ -1016,19 +1041,22 @@ void VPlan::execute(VPTransformState *State) {
}
}

InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) {
// For now only return the cost of the vector loop region, ignoring any other
// blocks, like the preheader or middle blocks, expect for checking them for
// recipes with invalid costs.
InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);
InstructionCost Cost =
getVectorLoopRegion()->cost(VF, Ctx, CountsVecCalcOnly);

// If the cost of the loop region is invalid or any recipe in the skeleton
// outside loop regions are invalid return an invalid cost.
if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(getEntry())),
[&VF, &Ctx](VPBasicBlock *VPBB) {
return !VPBB->cost(VF, Ctx).isValid();
}))
if (!Cost.isValid() ||
any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(getEntry())),
[&VF, &Ctx, &CountsVecCalcOnly](VPBasicBlock *VPBB) {
return !VPBB->cost(VF, Ctx, CountsVecCalcOnly).isValid();
}))
return InstructionCost::getInvalid();

return Cost;
12 changes: 8 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -340,7 +340,8 @@ class LLVM_ABI_FOR_TEST VPBlockBase {
virtual void execute(VPTransformState *State) = 0;

/// Return the cost of the block.
virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly = false) = 0;

/// Return true if it is legal to hoist instructions into this block.
bool isLegalToHoistInto() {
@@ -3716,7 +3717,8 @@ class LLVM_ABI_FOR_TEST VPBasicBlock : public VPBlockBase {
void execute(VPTransformState *State) override;

/// Return the cost of this VPBasicBlock.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) override;

/// Return the position of the first non-phi node recipe in the block.
iterator getFirstNonPhi();
@@ -3897,7 +3899,8 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
void execute(VPTransformState *State) override;

// Return the cost of this region.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
@@ -4022,7 +4025,8 @@ class VPlan {
void execute(VPTransformState *State);

/// Return the cost of this plan.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly = false);

VPBasicBlock *getEntry() { return Entry; }
const VPBasicBlock *getEntry() const { return Entry; }
@@ -0,0 +1,58 @@
; REQUIRES: asserts
; RUN: opt < %s -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
; RUN: opt < %s -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -vectorizer-maximize-bandwidth -vectorizer-maximize-bandwidth-conservatively -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=CHECK-CONS

define void @f(i32 %n, ptr noalias %a, ptr %b, ptr %c) {
; The following loop is an example where choosing a larger vector width reduces
; the number of instructions but may lead to performance degradation due to the
; FP pipeline becoming a bottleneck.
;
; void f(int n, short *restrict a, long *b, double *c) {
; for (int i = 0; i < n; i++) {
; a[i] = b[i] + c[i];
; }
; }

; In the usual cost model, vscale x 8 is chosen.
; CHECK: Cost for VF vscale x 2: 8 (Estimated cost per lane: 4.0)
; CHECK: Cost for VF vscale x 4: 14 (Estimated cost per lane: 3.5)
; CHECK: Cost for VF vscale x 8: 26 (Estimated cost per lane: 3.2)
; CHECK: LV: Selecting VF: vscale x 8.

; With the conservative cost model, a larger vector width is chosen only if it
; is also superior when the candidates are compared solely on the cost of the
; FP pipeline, in addition to winning under the usual model.
; CHECK-CONS: Cost for VF vscale x 2: 3 (Estimated cost per lane: 1.5)
; CHECK-CONS: Cost for VF vscale x 4: 7 (Estimated cost per lane: 1.8)
; CHECK-CONS: Cost for VF vscale x 8: 15 (Estimated cost per lane: 1.9)
; CHECK-CONS: LV: Selecting VF: vscale x 2.

entry:
%cmp10 = icmp sgt i32 %n, 0
br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
%wide.trip.count = zext nneg i32 %n to i64
br label %for.body

for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void

for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw i64, ptr %b, i64 %indvars.iv
%0 = load i64, ptr %arrayidx, align 8
%conv = sitofp i64 %0 to double
%arrayidx2 = getelementptr inbounds nuw double, ptr %c, i64 %indvars.iv
%1 = load double, ptr %arrayidx2, align 8
%add = fadd double %1, %conv
%conv3 = fptosi double %add to i16
%arrayidx5 = getelementptr inbounds nuw i16, ptr %a, i64 %indvars.iv
store i16 %conv3, ptr %arrayidx5, align 2
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}