7 changes: 7 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1231,6 +1231,13 @@ class TargetTransformInfo {
LLVM_ABI bool
shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;

/// \return True if vectorization factors wider than those matching the
/// largest element type should be chosen conservatively. This only makes
/// sense when shouldMaximizeVectorBandwidth returns true.
/// \p K Register Kind for vectorization.
LLVM_ABI bool shouldMaximizeVectorBandwidthConservatively(
TargetTransformInfo::RegisterKind K) const;

/// \return The minimum vectorization factor for types of given element
/// bit width, or 0 if there is no minimum VF. The returned value only
/// applies when shouldMaximizeVectorBandwidth returns true.
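
For context, a backend opts in by overriding the new shouldMaximizeVectorBandwidthConservatively hook in its TTI implementation, alongside its existing shouldMaximizeVectorBandwidth override. A minimal sketch; the target class name and the policy it returns are illustrative assumptions, not part of this patch:

// Hypothetical override in a target's TTI implementation (MyTargetTTIImpl is
// a placeholder); the policy shown is an example only.
bool MyTargetTTIImpl::shouldMaximizeVectorBandwidthConservatively(
    TargetTransformInfo::RegisterKind K) const {
  // Apply the conservative comparison only to scalable vectors; fixed-width
  // VFs keep the existing selection behaviour.
  return K == TargetTransformInfo::RGK_ScalableVector;
}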
5 changes: 5 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -597,6 +597,11 @@ class TargetTransformInfoImplBase {
return false;
}

virtual bool shouldMaximizeVectorBandwidthConservatively(
TargetTransformInfo::RegisterKind K) const {
return false;
}

virtual ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const {
return ElementCount::get(0, IsScalable);
}
5 changes: 5 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -803,6 +803,11 @@ bool TargetTransformInfo::shouldMaximizeVectorBandwidth(
return TTIImpl->shouldMaximizeVectorBandwidth(K);
}

bool TargetTransformInfo::shouldMaximizeVectorBandwidthConservatively(
TargetTransformInfo::RegisterKind K) const {
return TTIImpl->shouldMaximizeVectorBandwidthConservatively(K);
}

ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth,
bool IsScalable) const {
return TTIImpl->getMinimumVF(ElemWidth, IsScalable);
7 changes: 6 additions & 1 deletion llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -76,6 +76,9 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
"dmb-lookahead-threshold", cl::init(10), cl::Hidden,
cl::desc("The number of instructions to search for a redundant dmb"));

static cl::opt<bool> EnableSVEMaximizeVecBW("enable-sve-maximize-vec-bw",
                                            cl::init(false), cl::Hidden);

Review comment (Contributor):
I'm not sure why we need a new flag to force enabling the max bandwidth, since one already exists in LoopVectorize.cpp - vectorizer-maximize-bandwidth.

Reply (Contributor Author):
Thank you. I did not intend to include this part in this patch. Removed.

namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
@@ -370,7 +373,9 @@ bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
TargetTransformInfo::RegisterKind K) const {
assert(K != TargetTransformInfo::RGK_Scalar);
return (K == TargetTransformInfo::RGK_FixedWidthVector &&
ST->isNeonAvailable());
ST->isNeonAvailable()) ||
(EnableSVEMaximizeVecBW &&
K == TargetTransformInfo::RGK_ScalableVector && ST->isSVEAvailable());
}

/// Calculate the cost of materializing a 64-bit value. This helper
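
As the review thread above notes, max-bandwidth selection can already be requested from the command line without a target-specific flag; the new test at the end of this change does exactly that. A sample invocation, with the flags taken from that test's RUN lines (input.ll is a placeholder file name):

opt < input.ll -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize \
    -vectorizer-maximize-bandwidth -vectorizer-maximize-bandwidth-conservatively -S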
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -474,7 +474,8 @@ class LoopVectorizationPlanner {
///
/// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
/// been retired.
InstructionCost cost(VPlan &Plan, ElementCount VF) const;
InstructionCost cost(VPlan &Plan, ElementCount VF,
bool CountsVecCalcOnly = false) const;

/// Precompute costs for certain instructions using the legacy cost model. The
/// function is used to bring up the VPlan-based cost model to initially avoid
77 changes: 63 additions & 14 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -263,6 +263,11 @@ static cl::opt<bool> MaximizeBandwidth(
cl::desc("Maximize bandwidth when selecting vectorization factor which "
"will be determined by the smallest type in loop."));

static cl::opt<bool> MaximizeBandwidthConservatively(
"vectorizer-maximize-bandwidth-conservatively", cl::init(false), cl::Hidden,
cl::desc("When MaximizeBandwidth is enabled, a larger vector factor is "
"chosen conservatively."));

static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
@@ -962,9 +967,16 @@ class LoopVectorizationCostModel {
/// user options, for the given register kind.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);

/// \return True if maximizing vector bandwidth should be applied
/// conservatively by the target or user options, for the given register kind.
/// This only makes sense when useMaxBandwidth returns true.
bool useMaxBandwidthConservatively(TargetTransformInfo::RegisterKind RegKind);

/// \return True if register pressure should be calculated for the given VF.
bool shouldCalculateRegPressureForVF(ElementCount VF);

/// \return True if \p VF is only permitted because maximizing vector
/// bandwidth is enabled, i.e. it is wider than the widest VF considered
/// without MaxBandwidth.
bool isVFForMaxBandwidth(ElementCount VF);

/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
/// 64 bit loop indices.
@@ -3812,11 +3824,15 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {

bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF(
ElementCount VF) {
// Only calculate register pressure for VFs enabled by MaxBandwidth.
return isVFForMaxBandwidth(VF);
}

bool LoopVectorizationCostModel::isVFForMaxBandwidth(ElementCount VF) {
if (!useMaxBandwidth(VF.isScalable()
? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector))
return false;
// Only calculate register pressure for VFs enabled by MaxBandwidth.
return ElementCount::isKnownGT(
VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
: MaxPermissibleVFWithoutMaxBW.FixedVF);
@@ -3830,6 +3846,13 @@ bool LoopVectorizationCostModel::useMaxBandwidth(
Legal->hasVectorCallVariants())));
}

bool LoopVectorizationCostModel::useMaxBandwidthConservatively(
TargetTransformInfo::RegisterKind RegKind) {
return MaximizeBandwidthConservatively ||
(MaximizeBandwidthConservatively.getNumOccurrences() == 0 &&
TTI.shouldMaximizeVectorBandwidthConservatively(RegKind));
}

ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
unsigned EstimatedVF = VF.getKnownMinValue();
@@ -6923,13 +6946,16 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
return Cost;
}

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
bool CountsVecCalcOnly) const {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
InstructionCost Cost;

if (!CountsVecCalcOnly)
Cost += precomputeCosts(Plan, VF, CostCtx);

// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
Cost += Plan.cost(VF, CostCtx, CountsVecCalcOnly);
#ifndef NDEBUG
unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
@@ -7105,8 +7131,25 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
continue;
}

if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
BestFactor = CurrentFactor;
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) {
if (CM.isVFForMaxBandwidth(VF) &&
CM.useMaxBandwidthConservatively(
VF.isScalable() ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector)) {
if (ElementCount::isKnownLT(BestFactor.Width, VF) &&
    llvm::is_contained(VFs, BestFactor.Width)) {
VectorizationFactor BestFactorVecCalc(
BestFactor.Width, cost(*P, BestFactor.Width, true), ScalarCost);
VectorizationFactor CurrentFactorVecCalc(VF, cost(*P, VF, true),
ScalarCost);
if (isMoreProfitable(CurrentFactorVecCalc, BestFactorVecCalc,
P->hasScalarTail()))
BestFactor = CurrentFactor;
}
} else {
BestFactor = CurrentFactor;
}
}

// If profitable add it to ProfitableVF list.
if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
@@ -7131,13 +7174,19 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
// legacy cost model doesn't properly model costs for such loops.
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
CostCtx, OrigLoop,
BestFactor.Width) ||
planContainsAdditionalSimplifications(
getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
" VPlan cost model and legacy cost model disagreed");
if (!CM.isVFForMaxBandwidth(LegacyVF.Width) ||
!CM.useMaxBandwidthConservatively(
LegacyVF.Width.isScalable()
? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector))
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
CostCtx, OrigLoop,
BestFactor.Width) ||
planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
CostCtx, OrigLoop,
LegacyVF.Width)) &&
" VPlan cost model and legacy cost model disagreed");
assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
"when vectorizing, the scalar cost must be computed.");
#endif
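
To make the conservative selection in computeBestVF concrete, here is how it plays out on the new AArch64/SVE test at the end of this change, using the per-lane figures printed in that test's CHECK lines:

Full cost model:               vscale x 2 = 4.0/lane, vscale x 4 = 3.5/lane, vscale x 8 = 3.2/lane  (vscale x 8 preferred)
Vector-calculation cost only:  vscale x 2 = 1.5/lane, vscale x 4 = 1.8/lane, vscale x 8 = 1.9/lane  (vscale x 2 preferred)

Because the wider max-bandwidth candidates do not also win the vector-calculation-only comparison, BestFactor keeps vscale x 2 when the conservative mode is in effect.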
54 changes: 41 additions & 13 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -800,10 +800,34 @@ void VPRegionBlock::execute(VPTransformState *State) {
State->Lane.reset();
}

InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) {
InstructionCost Cost = 0;
for (VPRecipeBase &R : Recipes)
Cost += R.cost(VF, Ctx);
for (VPRecipeBase &R : Recipes) {
if (!CountsVecCalcOnly)
Cost += R.cost(VF, Ctx);
else {
switch (R.getVPDefID()) {
case VPDef::VPActiveLaneMaskPHISC:
case VPDef::VPBlendSC:
case VPDef::VPFirstOrderRecurrencePHISC:
case VPDef::VPPartialReductionSC:
case VPDef::VPReductionPHISC:
case VPDef::VPReductionSC:
case VPDef::VPWidenCallSC:
case VPDef::VPWidenCanonicalIVSC:
case VPDef::VPWidenCastSC:
case VPDef::VPWidenGEPSC:
case VPDef::VPWidenIntOrFpInductionSC:
case VPDef::VPWidenIntrinsicSC:
case VPDef::VPWidenPHISC:
case VPDef::VPWidenPointerInductionSC:
case VPDef::VPWidenSC:
case VPDef::VPWidenSelectSC:
Cost += R.cost(VF, Ctx);
}
}
}
return Cost;
}

@@ -826,11 +850,12 @@ const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
return Pred->getExitingBasicBlock();
}

InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) {
if (!isReplicator()) {
InstructionCost Cost = 0;
for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
Cost += Block->cost(VF, Ctx);
Cost += Block->cost(VF, Ctx, CountsVecCalcOnly);
InstructionCost BackedgeCost =
ForceTargetInstructionCost.getNumOccurrences()
? InstructionCost(ForceTargetInstructionCost.getNumOccurrences())
@@ -853,7 +878,7 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
// uniform condition.
using namespace llvm::VPlanPatternMatch;
VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
InstructionCost ThenCost = Then->cost(VF, Ctx);
InstructionCost ThenCost = Then->cost(VF, Ctx, CountsVecCalcOnly);

// For the scalar case, we may not always execute the original predicated
// block, Thus, scale the block's cost by the probability of executing it.
@@ -1016,19 +1041,22 @@ void VPlan::execute(VPTransformState *State) {
}
}

InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) {
// For now only return the cost of the vector loop region, ignoring any other
// blocks, like the preheader or middle blocks, expect for checking them for
// recipes with invalid costs.
InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);
InstructionCost Cost =
getVectorLoopRegion()->cost(VF, Ctx, CountsVecCalcOnly);

// If the cost of the loop region is invalid or any recipe in the skeleton
// outside loop regions are invalid return an invalid cost.
if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(getEntry())),
[&VF, &Ctx](VPBasicBlock *VPBB) {
return !VPBB->cost(VF, Ctx).isValid();
}))
if (!Cost.isValid() ||
any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(getEntry())),
[&VF, &Ctx, &CountsVecCalcOnly](VPBasicBlock *VPBB) {
return !VPBB->cost(VF, Ctx, CountsVecCalcOnly).isValid();
}))
return InstructionCost::getInvalid();

return Cost;
12 changes: 8 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -340,7 +340,8 @@ class LLVM_ABI_FOR_TEST VPBlockBase {
virtual void execute(VPTransformState *State) = 0;

/// Return the cost of the block.
virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly = false) = 0;

/// Return true if it is legal to hoist instructions into this block.
bool isLegalToHoistInto() {
@@ -3716,7 +3717,8 @@ class LLVM_ABI_FOR_TEST VPBasicBlock : public VPBlockBase {
void execute(VPTransformState *State) override;

/// Return the cost of this VPBasicBlock.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) override;

/// Return the position of the first non-phi node recipe in the block.
iterator getFirstNonPhi();
@@ -3897,7 +3899,8 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
void execute(VPTransformState *State) override;

// Return the cost of this region.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
@@ -4022,7 +4025,8 @@ class VPlan {
void execute(VPTransformState *State);

/// Return the cost of this plan.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly = false);

VPBasicBlock *getEntry() { return Entry; }
const VPBasicBlock *getEntry() const { return Entry; }
@@ -0,0 +1,58 @@
; REQUIRES: asserts
; RUN: opt < %s -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
; RUN: opt < %s -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -vectorizer-maximize-bandwidth -vectorizer-maximize-bandwidth-conservatively -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=CHECK-CONS

define void @f(i32 %n, ptr noalias %a, ptr %b, ptr %c) {
; The following loop is an example where choosing a larger vector width reduces
; the number of instructions but may lead to performance degradation due to the
; FP pipeline becoming a bottleneck.
;
; void f(int n, short *restrict a, long *b, double *c) {
; for (int i = 0; i < n; i++) {
; a[i] = b[i] + c[i];
; }
; }

; In the usual cost model, vscale x 8 is chosen.
; CHECK: Cost for VF vscale x 2: 8 (Estimated cost per lane: 4.0)
; CHECK: Cost for VF vscale x 4: 14 (Estimated cost per lane: 3.5)
; CHECK: Cost for VF vscale x 8: 26 (Estimated cost per lane: 3.2)
; CHECK: LV: Selecting VF: vscale x 8.

; With the conservative cost model, a larger vector width is chosen only if it
; is also superior when the candidates are compared solely on the cost of the
; FP pipeline, in addition to winning under the usual model.
; CHECK-CONS: Cost for VF vscale x 2: 3 (Estimated cost per lane: 1.5)
; CHECK-CONS: Cost for VF vscale x 4: 7 (Estimated cost per lane: 1.8)
; CHECK-CONS: Cost for VF vscale x 8: 15 (Estimated cost per lane: 1.9)
; CHECK-CONS: LV: Selecting VF: vscale x 2.

entry:
%cmp10 = icmp sgt i32 %n, 0
br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
%wide.trip.count = zext nneg i32 %n to i64
br label %for.body

for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void

for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds nuw i64, ptr %b, i64 %indvars.iv
%0 = load i64, ptr %arrayidx, align 8
%conv = sitofp i64 %0 to double
%arrayidx2 = getelementptr inbounds nuw double, ptr %c, i64 %indvars.iv
%1 = load double, ptr %arrayidx2, align 8
%add = fadd double %1, %conv
%conv3 = fptosi double %add to i16
%arrayidx5 = getelementptr inbounds nuw i16, ptr %a, i64 %indvars.iv
store i16 %conv3, ptr %arrayidx5, align 2
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}