Skip to content

Commit 8fa2cc5

Browse files
[LV] Consider interleaving & tail-folding when -enable-wide-lane-mask=true
Currently, the only way to enable wide active lane masks is to pass -enable-wide-lane-mask and also force both interleaving and tail-folding with additional flags. This patch changes selectInterleaveCount and preferPredicateOverEpilogue so that both interleaving and tail-folding are considered whenever wide lane masks are requested; the feature itself remains off by default. Basic cost-model changes are also included: the cost of the get.active.lane.mask intrinsic is reduced when the return type would require splitting but the whilelo (predicate pair) instruction is known to be usable.
1 parent 32fffe5 commit 8fa2cc5

File tree

7 files changed

+186
-38
lines changed

7 files changed

+186
-38
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,9 +209,10 @@ struct TailFoldingInfo {
209209
TargetLibraryInfo *TLI;
210210
LoopVectorizationLegality *LVL;
211211
InterleavedAccessInfo *IAI;
212+
bool UseWideLaneMask;
212213
TailFoldingInfo(TargetLibraryInfo *TLI, LoopVectorizationLegality *LVL,
213-
InterleavedAccessInfo *IAI)
214-
: TLI(TLI), LVL(LVL), IAI(IAI) {}
214+
InterleavedAccessInfo *IAI, bool UseWideLaneMask = false)
215+
: TLI(TLI), LVL(LVL), IAI(IAI), UseWideLaneMask(UseWideLaneMask) {}
215216
};
216217

217218
class TargetTransformInfo;

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -957,10 +957,18 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
957957
return TyL.first + ExtraCost;
958958
}
959959
case Intrinsic::get_active_lane_mask: {
960-
auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
961-
if (RetTy) {
962-
EVT RetVT = getTLI()->getValueType(DL, RetTy);
963-
EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
960+
auto RetTy = cast<VectorType>(ICA.getReturnType());
961+
EVT RetVT = getTLI()->getValueType(DL, RetTy);
962+
EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
963+
if (RetTy->isScalableTy()) {
964+
if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) ||
965+
(!ST->hasSVE2p1() && !ST->hasSME2()) ||
966+
TLI->getTypeAction(RetTy->getContext(), RetVT) !=
967+
TargetLowering::TypeSplitVector)
968+
break;
969+
auto LT = getTypeLegalizationCost(RetTy);
970+
return LT.first / 2;
971+
} else {
964972
if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
965973
!getTLI()->isTypeLegal(RetVT)) {
966974
// We don't have enough context at this point to determine if the mask
@@ -972,7 +980,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
972980
// NOTE: getScalarizationOverhead returns a cost that's far too
973981
// pessimistic for the actual generated codegen. In reality there are
974982
// two instructions generated per lane.
975-
return RetTy->getNumElements() * 2;
983+
return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
976984
}
977985
}
978986
break;
@@ -6146,8 +6154,11 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
61466154
if (Required == TailFoldingOpts::Disabled)
61476155
Required |= TailFoldingOpts::Simple;
61486156

6149-
if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6150-
Required))
6157+
TailFoldingOpts DefaultOpts = ST->getSVETailFoldingDefaultOpts();
6158+
if (TFI->UseWideLaneMask)
6159+
DefaultOpts |= TailFoldingOpts::Simple;
6160+
6161+
if (!TailFoldingOptionLoc.satisfies(DefaultOpts, Required))
61516162
return false;
61526163

61536164
// Don't tail-fold for tight loops where we would be better off interleaving

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,10 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
249249
"Use predicated EVL instructions for tail folding. If EVL "
250250
"is unsupported, fallback to data-without-lane-mask.")));
251251

252+
// Hidden command-line flag "enable-wide-lane-mask", initialised to false.
// Defined in the llvm namespace (non-static) so that it can be referenced
// from more than one translation unit through an extern declaration.
cl::opt<bool> llvm::EnableWideActiveLaneMask(
253+
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
254+
cl::desc("Enable use of wide get active lane mask instructions"));
255+
252256
static cl::opt<bool> MaximizeBandwidth(
253257
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
254258
cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1346,6 +1350,15 @@ class LoopVectorizationCostModel {
13461350
return getTailFoldingStyle() != TailFoldingStyle::None;
13471351
}
13481352

1353+
/// Returns true if wide active lane masks should be used. This requires the
/// EnableWideActiveLaneMask option to be set and the tail-folding style
/// reported by getTailFoldingStyle() to be either DataAndControlFlow or
/// DataAndControlFlowWithoutRuntimeCheck; otherwise returns false.
bool useWideActiveLaneMask() const {
1354+
if (!EnableWideActiveLaneMask)
1355+
return false;
1356+
1357+
// Only the two DataAndControlFlow tail-folding styles qualify.
TailFoldingStyle TF = getTailFoldingStyle();
1358+
return TF == TailFoldingStyle::DataAndControlFlow ||
1359+
TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
1360+
}
1361+
13491362
/// Return maximum safe number of elements to be processed per vector
13501363
/// iteration, which do not prevent store-load forwarding and are safe with
13511364
/// regard to the memory dependencies. Required for EVL-based VPlans to
@@ -4518,7 +4531,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
45184531
// 3. We don't interleave if we think that we will spill registers to memory
45194532
// due to the increased register pressure.
45204533

4521-
if (!CM.isScalarEpilogueAllowed())
4534+
if (!CM.isScalarEpilogueAllowed() && !CM.useWideActiveLaneMask())
45224535
return 1;
45234536

45244537
if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
@@ -8995,7 +9008,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
89959008
};
89969009

89979010
// 4) if the TTI hook indicates this is profitable, request predication.
8998-
TailFoldingInfo TFI(TLI, &LVL, IAI);
9011+
TailFoldingInfo TFI(TLI, &LVL, IAI, EnableWideActiveLaneMask);
89999012
if (TTI->preferPredicateOverEpilogue(&TFI))
90009013
return CM_ScalarEpilogueNotNeededUsePredicate;
90019014

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,6 @@
4040
using namespace llvm;
4141
using namespace VPlanPatternMatch;
4242

43-
static cl::opt<bool> EnableWideActiveLaneMask(
44-
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
45-
cl::desc("Enable use of wide get active lane mask instructions"));
46-
4743
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
4844
VPlan &Plan,
4945
function_ref<const InductionDescriptor *(PHINode *)>

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ class VPRecipeBuilder;
3232
struct VFRange;
3333

3434
extern cl::opt<bool> VerifyEachVPlan;
35+
extern cl::opt<bool> EnableWideActiveLaneMask;
3536

3637
struct VPlanTransforms {
3738
/// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra

0 commit comments

Comments
 (0)