-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[LV] Consider interleaving & tail-folding when -enable-wide-lane-mask=true #163387
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -957,10 +957,18 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |
return TyL.first + ExtraCost; | ||
} | ||
case Intrinsic::get_active_lane_mask: { | ||
auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType()); | ||
if (RetTy) { | ||
EVT RetVT = getTLI()->getValueType(DL, RetTy); | ||
EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); | ||
auto RetTy = cast<VectorType>(ICA.getReturnType()); | ||
EVT RetVT = getTLI()->getValueType(DL, RetTy); | ||
EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); | ||
if (RetTy->isScalableTy()) { | ||
if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) || | ||
(!ST->hasSVE2p1() && !ST->hasSME2()) || | ||
TLI->getTypeAction(RetTy->getContext(), RetVT) != | ||
TargetLowering::TypeSplitVector) | ||
break; | ||
auto LT = getTypeLegalizationCost(RetTy); | ||
return LT.first / 2; | ||
kmclaughlin-arm marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} else { | ||
if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) && | ||
!getTLI()->isTypeLegal(RetVT)) { | ||
// We don't have enough context at this point to determine if the mask | ||
|
@@ -972,7 +980,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |
// NOTE: getScalarizationOverhead returns a cost that's far too | ||
// pessimistic for the actual generated codegen. In reality there are | ||
// two instructions generated per lane. | ||
return RetTy->getNumElements() * 2; | ||
return cast<FixedVectorType>(RetTy)->getNumElements() * 2; | ||
} | ||
} | ||
break; | ||
|
@@ -6146,8 +6154,11 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { | |
if (Required == TailFoldingOpts::Disabled) | ||
Required |= TailFoldingOpts::Simple; | ||
|
||
if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(), | ||
Required)) | ||
TailFoldingOpts DefaultOpts = ST->getSVETailFoldingDefaultOpts(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure about this behaviour to be honest. Enabling the use of wide lane masks itself doesn't automatically imply that we should tail-fold all simple loops. I see two issues here:
If you want a way to force simple tail-folding with wide lane masks it might be better to use a target flag that lives in this file. For example, you could add a new option to You could rename the loop vectoriser flag
or something like that? |
||
if (TFI->UseWideLaneMask) | ||
DefaultOpts |= TailFoldingOpts::Simple; | ||
|
||
if (!TailFoldingOptionLoc.satisfies(DefaultOpts, Required)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure it's a good idea to ignore the instruction threshold below because the problem that exists for normal tail-folding will also exist for wide lane masks. If the user really wants to test the special case of tail-folding for small loops they can always do it in conjunction with -sve-tail-folding-insn-threshold=0. |
||
return false; | ||
|
||
// Don't tail-fold for tight loops where we would be better off interleaving | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -249,6 +249,10 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( | |
"Use predicated EVL instructions for tail folding. If EVL " | ||
"is unsupported, fallback to data-without-lane-mask."))); | ||
|
||
// Hidden boolean command-line option `-enable-wide-lane-mask`, off by default.
// It is read by LoopVectorizationCostModel::useWideActiveLaneMask() and passed
// into TailFoldingInfo when choosing the scalar-epilogue lowering.
cl::opt<bool> llvm::EnableWideActiveLaneMask( | ||
"enable-wide-lane-mask", cl::init(false), cl::Hidden, | ||
cl::desc("Enable use of wide get active lane mask instructions")); | ||
|
||
static cl::opt<bool> MaximizeBandwidth( | ||
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, | ||
cl::desc("Maximize bandwidth when selecting vectorization factor which " | ||
|
@@ -1346,6 +1350,15 @@ class LoopVectorizationCostModel { | |
return getTailFoldingStyle() != TailFoldingStyle::None; | ||
} | ||
|
||
/// Returns true when wide active lane masks are to be used for this loop.
/// This requires the EnableWideActiveLaneMask option to be set and the
/// selected tail-folding style to be one of the data-and-control-flow styles
/// (with or without the runtime check).
bool useWideActiveLaneMask() const { | ||
if (!EnableWideActiveLaneMask) | ||
return false; | ||
|
||
// Only the styles where the lane mask also drives the loop's control flow
// qualify; every other style returns false.
TailFoldingStyle TF = getTailFoldingStyle(); | ||
return TF == TailFoldingStyle::DataAndControlFlow || | ||
TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; | ||
} | ||
|
||
/// Return maximum safe number of elements to be processed per vector | ||
/// iteration, which do not prevent store-load forwarding and are safe with | ||
/// regard to the memory dependencies. Required for EVL-based VPlans to | ||
|
@@ -4518,7 +4531,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, | |
// 3. We don't interleave if we think that we will spill registers to memory | ||
// due to the increased register pressure. | ||
|
||
if (!CM.isScalarEpilogueAllowed()) | ||
if (!CM.isScalarEpilogueAllowed() && !CM.useWideActiveLaneMask()) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This deserves some explanation, why |
||
return 1; | ||
|
||
if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), | ||
|
@@ -8995,7 +9008,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( | |
}; | ||
|
||
// 4) if the TTI hook indicates this is profitable, request predication. | ||
TailFoldingInfo TFI(TLI, &LVL, IAI); | ||
TailFoldingInfo TFI(TLI, &LVL, IAI, EnableWideActiveLaneMask); | ||
if (TTI->preferPredicateOverEpilogue(&TFI)) | ||
return CM_ScalarEpilogueNotNeededUsePredicate; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This flag is being set by a loop vectoriser flag called 'EnableWideLaneMask', which to me isn't the same as 'UseWideLaneMask'. The latter makes it sound like a decision has already been made, whereas the former sounds more like a possibility if the target wishes to use them.