[LV] Consider interleaving & tail-folding when -enable-wide-lane-mask=true

kmclaughlin-arm · kmclaughlin-arm · commit 8fa2cc55171e · 2025-10-14T12:48:56.000Z
Currently the only way to enable the use of wide active lane masks is to pass
-enable-wide-lane-mask and force both interleaving &amp; tail-folding with additional
flags. This patch changes selectInterleaveCount &amp; preferPredicateOverEpilogue to
consider both interleaving and tail-folding if wide lane masks were requested,
although the feature remains off by default.

Basic cost model changes are also included which reduce the cost of the
get.active.lane.mask intrinsic when the return type would require splitting,
but we know the whilelo (predicate pair) instruction can be used.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -209,9 +209,10 @@ struct TailFoldingInfo {
   TargetLibraryInfo *TLI;
   LoopVectorizationLegality *LVL;
   InterleavedAccessInfo *IAI;
+  bool UseWideLaneMask;
   TailFoldingInfo(TargetLibraryInfo *TLI, LoopVectorizationLegality *LVL,
-                  InterleavedAccessInfo *IAI)
-      : TLI(TLI), LVL(LVL), IAI(IAI) {}
+                  InterleavedAccessInfo *IAI, bool UseWideLaneMask = false)
+      : TLI(TLI), LVL(LVL), IAI(IAI), UseWideLaneMask(UseWideLaneMask) {}
 };
 
 class TargetTransformInfo;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -957,10 +957,18 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     return TyL.first + ExtraCost;
   }
   case Intrinsic::get_active_lane_mask: {
-    auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
-    if (RetTy) {
-      EVT RetVT = getTLI()->getValueType(DL, RetTy);
-      EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    auto RetTy = cast<VectorType>(ICA.getReturnType());
+    EVT RetVT = getTLI()->getValueType(DL, RetTy);
+    EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    if (RetTy->isScalableTy()) {
+      if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) ||
+          (!ST->hasSVE2p1() && !ST->hasSME2()) ||
+          TLI->getTypeAction(RetTy->getContext(), RetVT) !=
+              TargetLowering::TypeSplitVector)
+        break;
+      auto LT = getTypeLegalizationCost(RetTy);
+      return LT.first / 2;
+    } else {
       if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
           !getTLI()->isTypeLegal(RetVT)) {
         // We don't have enough context at this point to determine if the mask
@@ -972,7 +980,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
         // NOTE: getScalarizationOverhead returns a cost that's far too
         // pessimistic for the actual generated codegen. In reality there are
         // two instructions generated per lane.
-        return RetTy->getNumElements() * 2;
+        return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
       }
     }
     break;
@@ -6146,8 +6154,11 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
   if (Required == TailFoldingOpts::Disabled)
     Required |= TailFoldingOpts::Simple;
 
-  if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
-                                      Required))
+  TailFoldingOpts DefaultOpts = ST->getSVETailFoldingDefaultOpts();
+  if (TFI->UseWideLaneMask)
+    DefaultOpts |= TailFoldingOpts::Simple;
+
+  if (!TailFoldingOptionLoc.satisfies(DefaultOpts, Required))
     return false;
 
   // Don't tail-fold for tight loops where we would be better off interleaving
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -249,6 +249,10 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
                    "Use predicated EVL instructions for tail folding. If EVL "
                    "is unsupported, fallback to data-without-lane-mask.")));
 
+cl::opt<bool> llvm::EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide get active lane mask instructions"));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1346,6 +1350,15 @@ class LoopVectorizationCostModel {
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
+  bool useWideActiveLaneMask() const {
+    if (!EnableWideActiveLaneMask)
+      return false;
+
+    TailFoldingStyle TF = getTailFoldingStyle();
+    return TF == TailFoldingStyle::DataAndControlFlow ||
+           TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+  }
+
   /// Return maximum safe number of elements to be processed per vector
   /// iteration, which do not prevent store-load forwarding and are safe with
   /// regard to the memory dependencies. Required for EVL-based VPlans to
@@ -4518,7 +4531,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // 3. We don't interleave if we think that we will spill registers to memory
   // due to the increased register pressure.
 
-  if (!CM.isScalarEpilogueAllowed())
+  if (!CM.isScalarEpilogueAllowed() && !CM.useWideActiveLaneMask())
     return 1;
 
   if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
@@ -8995,7 +9008,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
   };
 
   // 4) if the TTI hook indicates this is profitable, request predication.
-  TailFoldingInfo TFI(TLI, &LVL, IAI);
+  TailFoldingInfo TFI(TLI, &LVL, IAI, EnableWideActiveLaneMask);
   if (TTI->preferPredicateOverEpilogue(&TFI))
     return CM_ScalarEpilogueNotNeededUsePredicate;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,10 +40,6 @@
 using namespace llvm;
 using namespace VPlanPatternMatch;
 
-static cl::opt<bool> EnableWideActiveLaneMask(
-    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
-    cl::desc("Enable use of wide get active lane mask instructions"));
-
 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
     VPlan &Plan,
     function_ref<const InductionDescriptor *(PHINode *)>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -32,6 +32,7 @@ class VPRecipeBuilder;
 struct VFRange;
 
 extern cl::opt<bool> VerifyEachVPlan;
+extern cl::opt<bool> EnableWideActiveLaneMask;
 
 struct VPlanTransforms {
   /// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll