llvm · kmclaughlin-arm · Oct 13, 2025 · david-arm · Oct 16, 2025 · david-arm
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -209,9 +209,10 @@ struct TailFoldingInfo {
   TargetLibraryInfo *TLI;
   LoopVectorizationLegality *LVL;
   InterleavedAccessInfo *IAI;
+  bool UseWideLaneMask;
   TailFoldingInfo(TargetLibraryInfo *TLI, LoopVectorizationLegality *LVL,
-                  InterleavedAccessInfo *IAI)
-      : TLI(TLI), LVL(LVL), IAI(IAI) {}
+                  InterleavedAccessInfo *IAI, bool UseWideLaneMask = false)
+      : TLI(TLI), LVL(LVL), IAI(IAI), UseWideLaneMask(UseWideLaneMask) {}
 };
 
 class TargetTransformInfo;

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -957,10 +957,18 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     return TyL.first + ExtraCost;
   }
   case Intrinsic::get_active_lane_mask: {
-    auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
-    if (RetTy) {
-      EVT RetVT = getTLI()->getValueType(DL, RetTy);
-      EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    auto RetTy = cast<VectorType>(ICA.getReturnType());
+    EVT RetVT = getTLI()->getValueType(DL, RetTy);
+    EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    if (RetTy->isScalableTy()) {
+      if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) ||
+          (!ST->hasSVE2p1() && !ST->hasSME2()) ||
+          TLI->getTypeAction(RetTy->getContext(), RetVT) !=
+              TargetLowering::TypeSplitVector)
+        break;
+      auto LT = getTypeLegalizationCost(RetTy);
+      return LT.first / 2;
+    } else {
       if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
           !getTLI()->isTypeLegal(RetVT)) {
         // We don't have enough context at this point to determine if the mask
@@ -972,7 +980,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
         // NOTE: getScalarizationOverhead returns a cost that's far too
         // pessimistic for the actual generated codegen. In reality there are
         // two instructions generated per lane.
-        return RetTy->getNumElements() * 2;
+        return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
       }
     }
     break;
@@ -6146,8 +6154,11 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
   if (Required == TailFoldingOpts::Disabled)
     Required |= TailFoldingOpts::Simple;
 
-  if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
-                                      Required))
+  TailFoldingOpts DefaultOpts = ST->getSVETailFoldingDefaultOpts();
+  if (TFI->UseWideLaneMask)
+    DefaultOpts |= TailFoldingOpts::Simple;
+
+  if (!TailFoldingOptionLoc.satisfies(DefaultOpts, Required))
     return false;
 
   // Don't tail-fold for tight loops where we would be better off interleaving

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -249,6 +249,10 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
                    "Use predicated EVL instructions for tail folding. If EVL "
                    "is unsupported, fallback to data-without-lane-mask.")));
 
+cl::opt<bool> llvm::EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide get active lane mask instructions"));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1346,6 +1350,15 @@ class LoopVectorizationCostModel {
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
+  bool useWideActiveLaneMask() const {
+    if (!EnableWideActiveLaneMask)
+      return false;
+
+    TailFoldingStyle TF = getTailFoldingStyle();
+    return TF == TailFoldingStyle::DataAndControlFlow ||
+           TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+  }
+
   /// Return maximum safe number of elements to be processed per vector
   /// iteration, which do not prevent store-load forwarding and are safe with
   /// regard to the memory dependencies. Required for EVL-based VPlans to
@@ -4518,7 +4531,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // 3. We don't interleave if we think that we will spill registers to memory
   // due to the increased register pressure.
 
-  if (!CM.isScalarEpilogueAllowed())
+  if (!CM.isScalarEpilogueAllowed() && !CM.useWideActiveLaneMask())
     return 1;
 
   if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
@@ -8995,7 +9008,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
   };
 
   // 4) if the TTI hook indicates this is profitable, request predication.
-  TailFoldingInfo TFI(TLI, &LVL, IAI);
+  TailFoldingInfo TFI(TLI, &LVL, IAI, EnableWideActiveLaneMask);
   if (TTI->preferPredicateOverEpilogue(&TFI))
     return CM_ScalarEpilogueNotNeededUsePredicate;
 

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,10 +40,6 @@
 using namespace llvm;
 using namespace VPlanPatternMatch;
 
-static cl::opt<bool> EnableWideActiveLaneMask(
-    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
-    cl::desc("Enable use of wide get active lane mask instructions"));
-
 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
     VPlan &Plan,
     function_ref<const InductionDescriptor *(PHINode *)>

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -32,6 +32,7 @@ class VPRecipeBuilder;
 struct VFRange;
 
 extern cl::opt<bool> VerifyEachVPlan;
+extern cl::opt<bool> EnableWideActiveLaneMask;
 
 struct VPlanTransforms {
   /// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra