Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1431,12 +1431,9 @@ class LoopVectorizationCostModel {
// Override forced styles if needed.
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
bool EVLIsLegal =
IsScalableVF && UserIC <= 1 &&
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
!EnableVPlanNativePath &&
// FIXME: implement support for max safe dependency distance.
Legal->isSafeForAnyVectorWidth();
bool EVLIsLegal = UserIC <= 1 &&
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
!EnableVPlanNativePath;
Comment on lines +1434 to +1436
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(unrelated to this patch) Should this fallback of EVL, due to UserIC>1 or NativePath, apply to getPreferredTailFoldingStyle() decisions above, in addition to (de)forced styles below?

if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
Expand All @@ -1461,6 +1458,14 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}

/// Return maximum safe number of elements to be processed, which do not
/// prevent store-load forwarding and are safe with regard to the memory
/// dependencies. Required for EVL-based VPlans to correctly calculate AVL
/// (application vector length) as min(remaining AVL, MaxSafeElements).
/// TODO: need to consider adjusting cost model to use this value as a
/// vectorization factor for EVL-based vectorization.
std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }

/// Returns true if the instructions in this block requires predication
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
Expand Down Expand Up @@ -1612,6 +1617,12 @@ class LoopVectorizationCostModel {
/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;

/// Maximum safe number of elements to be processed per vector iteration,
/// which do not prevent store-load forwarding and are safe with regard to the
/// memory dependencies. Required for EVL-based veectorization, where this
/// value is used as the upper bound of the safe AVL.
std::optional<unsigned> MaxSafeElements;

/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
Expand Down Expand Up @@ -3858,6 +3869,8 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(

auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
if (!Legal->isSafeForAnyVectorWidth())
this->MaxSafeElements = MaxSafeElements;

LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
<< ".\n");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Also dump max EVL safe VF?)
(while we're here, the next else is redundant - follows a return)

Expand Down Expand Up @@ -8686,8 +8699,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::optimize(*Plan);
// TODO: try to put it close to addActiveLaneMask().
// Discard the plan if it is not EVL-compatible
if (CM.foldTailWithEVL() &&
!VPlanTransforms::tryAddExplicitVectorLength(*Plan))
if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
*Plan, CM.getMaxSafeElements()))
break;
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
Expand Down
9 changes: 6 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
return true;
switch (Opcode) {
case Instruction::ICmp:
case Instruction::Select:
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
case VPInstruction::CalculateTripCountMinusVF:
Expand Down Expand Up @@ -434,9 +435,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Builder.CreateCmp(getPredicate(), A, B, Name);
}
case Instruction::Select: {
Value *Cond = State.get(getOperand(0));
Value *Op1 = State.get(getOperand(1));
Value *Op2 = State.get(getOperand(2));
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
return Builder.CreateSelect(Cond, Op1, Op2, Name);
}
case VPInstruction::ActiveLaneMask: {
Expand Down Expand Up @@ -736,6 +738,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
default:
return false;
case Instruction::ICmp:
case Instruction::Select:
case VPInstruction::PtrAdd:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
Expand Down
38 changes: 30 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1430,7 +1430,24 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
/// ...
///
bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
/// If MaxSafeElements is provided, the function adds the following recipes:
/// vector.ph:
/// ...
///
/// vector.body:
/// ...
/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
/// [ %NextEVLIV, %vector.body ]
/// %AVL = sub original TC, %EVLPhi
/// %cmp = cmp ult %AVL, MaxSafeElements
/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
/// ...
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
/// ...
///
bool VPlanTransforms::tryAddExplicitVectorLength(
VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
// The transform updates all users of inductions to work based on EVL, instead
// of the VF directly. At the moment, widened inductions cannot be updated, so
Expand All @@ -1455,14 +1472,19 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
// Create the ExplicitVectorLengthPhi recipe in the main loop.
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
EVLPhi->insertAfter(CanonicalIVPHI);
// TODO: Add support for MaxSafeDist for correct loop emission.
VPBuilder Builder(Header, Header->getFirstNonPhi());
// Compute original TC - IV as the AVL (application vector length).
auto *AVL = new VPInstruction(Instruction::Sub, {Plan.getTripCount(), EVLPhi},
DebugLoc(), "avl");
AVL->insertBefore(*Header, Header->getFirstNonPhi());
auto *VPEVL =
new VPInstruction(VPInstruction::ExplicitVectorLength, AVL, DebugLoc());
VPEVL->insertAfter(AVL);
VPValue *AVL = Builder.createNaryOp(
Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl");
if (MaxSafeElements) {
// Support for MaxSafeDist for correct loop emission.
VPValue *AVLSafe = Plan.getOrAddLiveIn(
ConstantInt::get(CanonicalIVPHI->getScalarType(), *MaxSafeElements));
VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
}
auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
DebugLoc());

auto *CanonicalIVIncrement =
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,9 @@ struct VPlanTransforms {
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
/// \returns true if the transformation succeeds, or false if it doesn't.
static bool tryAddExplicitVectorLength(VPlan &Plan);
static bool
tryAddExplicitVectorLength(VPlan &Plan,
const std::optional<unsigned> &MaxEVLSafeElements);

// For each Interleave Group in \p InterleaveGroups replace the Recipes
// widening its memory instructions with a single VPInterleaveRecipe at its
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -422,28 +422,37 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[TMP7]], 1
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 3002, [[TMP1]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 3001, i64 3001, i64 3001, i64 3001>
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 3002, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult i64 [[AVL]], 1024
; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP9]], i64 [[AVL]], i64 1024
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 1, i1 true)
; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[EVL_BASED_IV]], 0
; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 32, <4 x i1> [[TMP1]], <4 x i64> poison)
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr align 32 [[TMP3]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 1024
; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]]
; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
; IF-EVL-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_MASKED_LOAD]], ptr [[TMP6]], i32 32, <4 x i1> [[TMP1]])
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3004
; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; IF-EVL-NEXT: call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP6]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
Expand Down
Loading