Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,9 +209,10 @@ struct TailFoldingInfo {
TargetLibraryInfo *TLI;
LoopVectorizationLegality *LVL;
InterleavedAccessInfo *IAI;
bool UseWideLaneMask;
TailFoldingInfo(TargetLibraryInfo *TLI, LoopVectorizationLegality *LVL,
InterleavedAccessInfo *IAI)
: TLI(TLI), LVL(LVL), IAI(IAI) {}
InterleavedAccessInfo *IAI, bool UseWideLaneMask = false)
: TLI(TLI), LVL(LVL), IAI(IAI), UseWideLaneMask(UseWideLaneMask) {}
};

class TargetTransformInfo;
Expand Down
25 changes: 18 additions & 7 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -957,10 +957,18 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return TyL.first + ExtraCost;
}
case Intrinsic::get_active_lane_mask: {
auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
if (RetTy) {
EVT RetVT = getTLI()->getValueType(DL, RetTy);
EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
auto RetTy = cast<VectorType>(ICA.getReturnType());
EVT RetVT = getTLI()->getValueType(DL, RetTy);
EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
if (RetTy->isScalableTy()) {
if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) ||
(!ST->hasSVE2p1() && !ST->hasSME2()) ||
TLI->getTypeAction(RetTy->getContext(), RetVT) !=
TargetLowering::TypeSplitVector)
break;
auto LT = getTypeLegalizationCost(RetTy);
return LT.first / 2;
} else {
if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
!getTLI()->isTypeLegal(RetVT)) {
// We don't have enough context at this point to determine if the mask
Expand All @@ -972,7 +980,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
// NOTE: getScalarizationOverhead returns a cost that's far too
// pessimistic for the actual generated codegen. In reality there are
// two instructions generated per lane.
return RetTy->getNumElements() * 2;
return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
}
}
break;
Expand Down Expand Up @@ -6146,8 +6154,11 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
if (Required == TailFoldingOpts::Disabled)
Required |= TailFoldingOpts::Simple;

if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
Required))
TailFoldingOpts DefaultOpts = ST->getSVETailFoldingDefaultOpts();
if (TFI->UseWideLaneMask)
DefaultOpts |= TailFoldingOpts::Simple;

if (!TailFoldingOptionLoc.satisfies(DefaultOpts, Required))
return false;

// Don't tail-fold for tight loops where we would be better off interleaving
Expand Down
17 changes: 15 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,10 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
"Use predicated EVL instructions for tail folding. If EVL "
"is unsupported, fallback to data-without-lane-mask.")));

/// Boolean command-line option registered as "enable-wide-lane-mask".
/// It is hidden (cl::Hidden) and initialised to false, so wide get active
/// lane mask instructions are only used when the option is passed explicitly.
cl::opt<bool> llvm::EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));

static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
Expand Down Expand Up @@ -1346,6 +1350,15 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}

bool useWideActiveLaneMask() const {
if (!EnableWideActiveLaneMask)
return false;

TailFoldingStyle TF = getTailFoldingStyle();
return TF == TailFoldingStyle::DataAndControlFlow ||
TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}

/// Return maximum safe number of elements to be processed per vector
/// iteration, which do not prevent store-load forwarding and are safe with
/// regard to the memory dependencies. Required for EVL-based VPlans to
Expand Down Expand Up @@ -4518,7 +4531,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.

if (!CM.isScalarEpilogueAllowed())
if (!CM.isScalarEpilogueAllowed() && !CM.useWideActiveLaneMask())
return 1;

if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
Expand Down Expand Up @@ -8995,7 +9008,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
};

// 4) if the TTI hook indicates this is profitable, request predication.
TailFoldingInfo TFI(TLI, &LVL, IAI);
TailFoldingInfo TFI(TLI, &LVL, IAI, EnableWideActiveLaneMask);
if (TTI->preferPredicateOverEpilogue(&TFI))
return CM_ScalarEpilogueNotNeededUsePredicate;

Expand Down
4 changes: 0 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,6 @@
using namespace llvm;
using namespace VPlanPatternMatch;

static cl::opt<bool> EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));

bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
VPlan &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class VPRecipeBuilder;
struct VFRange;

extern cl::opt<bool> VerifyEachVPlan;
extern cl::opt<bool> EnableWideActiveLaneMask;

struct VPlanTransforms {
/// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra
Expand Down
Loading