diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 027ee21527d22..52acf79da0ac7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1431,12 +1431,9 @@ class LoopVectorizationCostModel { // Override forced styles if needed. // FIXME: use actual opcode/data type for analysis here. // FIXME: Investigate opportunity for fixed vector factor. - bool EVLIsLegal = - IsScalableVF && UserIC <= 1 && - TTI.hasActiveVectorLength(0, nullptr, Align()) && - !EnableVPlanNativePath && - // FIXME: implement support for max safe dependency distance. - Legal->isSafeForAnyVectorWidth(); + bool EVLIsLegal = UserIC <= 1 && + TTI.hasActiveVectorLength(0, nullptr, Align()) && + !EnableVPlanNativePath; if (!EVLIsLegal) { // If for some reason EVL mode is unsupported, fallback to // DataWithoutLaneMask to try to vectorize the loop with folded tail @@ -1461,6 +1458,15 @@ class LoopVectorizationCostModel { return getTailFoldingStyle() != TailFoldingStyle::None; } + /// Return maximum safe number of elements to be processed per vector + /// iteration, which do not prevent store-load forwarding and are safe with + /// regard to the memory dependencies. Required for EVL-based VPlans to + /// correctly calculate AVL (application vector length) as min(remaining AVL, + /// MaxSafeElements). + /// TODO: need to consider adjusting cost model to use this value as a + /// vectorization factor for EVL-based vectorization. + std::optional getMaxSafeElements() const { return MaxSafeElements; } + /// Returns true if the instructions in this block requires predication /// for any reason, e.g. because tail folding now requires a predicate /// or because the block in the original loop was predicated. @@ -1612,6 +1618,12 @@ class LoopVectorizationCostModel { /// true if scalable vectorization is supported and enabled. 
std::optional IsScalableVectorizationAllowed; + /// Maximum safe number of elements to be processed per vector iteration, + /// which do not prevent store-load forwarding and are safe with regard to the + /// memory dependencies. Required for EVL-based vectorization, where this + /// value is used as the upper bound of the safe AVL. + std::optional MaxSafeElements; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated @@ -3858,6 +3870,8 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); + if (!Legal->isSafeForAnyVectorWidth()) + this->MaxSafeElements = MaxSafeElements; LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF << ".\n"); @@ -8686,8 +8700,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, VPlanTransforms::optimize(*Plan); // TODO: try to put it close to addActiveLaneMask(). 
// Discard the plan if it is not EVL-compatible - if (CM.foldTailWithEVL() && - !VPlanTransforms::tryAddExplicitVectorLength(*Plan)) + if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength( + *Plan, CM.getMaxSafeElements())) break; assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 368d6e58a5578..fc5961df3eec2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -386,6 +386,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { return true; switch (Opcode) { case Instruction::ICmp: + case Instruction::Select: case VPInstruction::BranchOnCond: case VPInstruction::BranchOnCount: case VPInstruction::CalculateTripCountMinusVF: @@ -434,9 +435,10 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreateCmp(getPredicate(), A, B, Name); } case Instruction::Select: { - Value *Cond = State.get(getOperand(0)); - Value *Op1 = State.get(getOperand(1)); - Value *Op2 = State.get(getOperand(2)); + bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); + Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); + Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); + Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); return Builder.CreateSelect(Cond, Op1, Op2, Name); } case VPInstruction::ActiveLaneMask: { @@ -736,6 +738,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { default: return false; case Instruction::ICmp: + case Instruction::Select: case VPInstruction::PtrAdd: // TODO: Cover additional opcodes. 
return vputils::onlyFirstLaneUsed(this); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 379bfc0a4394b..0035f5ecf7515 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1430,7 +1430,24 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { /// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi /// ... /// -bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) { +/// If MaxSafeElements is provided, the function adds the following recipes: +/// vector.ph: +/// ... +/// +/// vector.body: +/// ... +/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ], +/// [ %NextEVLIV, %vector.body ] +/// %AVL = sub original TC, %EVLPhi +/// %cmp = cmp ult %AVL, MaxSafeElements +/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements +/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL +/// ... +/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi +/// ... +/// +bool VPlanTransforms::tryAddExplicitVectorLength( + VPlan &Plan, const std::optional &MaxSafeElements) { VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock(); // The transform updates all users of inductions to work based on EVL, instead // of the VF directly. At the moment, widened inductions cannot be updated, so @@ -1455,14 +1472,19 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) { // Create the ExplicitVectorLengthPhi recipe in the main loop. auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); EVLPhi->insertAfter(CanonicalIVPHI); - // TODO: Add support for MaxSafeDist for correct loop emission. + VPBuilder Builder(Header, Header->getFirstNonPhi()); // Compute original TC - IV as the AVL (application vector length). 
- auto *AVL = new VPInstruction(Instruction::Sub, {Plan.getTripCount(), EVLPhi}, - DebugLoc(), "avl"); - AVL->insertBefore(*Header, Header->getFirstNonPhi()); - auto *VPEVL = - new VPInstruction(VPInstruction::ExplicitVectorLength, AVL, DebugLoc()); - VPEVL->insertAfter(AVL); + VPValue *AVL = Builder.createNaryOp( + Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl"); + if (MaxSafeElements) { + // Clamp AVL to MaxSafeElements to honor the max safe dependence distance. + VPValue *AVLSafe = Plan.getOrAddLiveIn( + ConstantInt::get(CanonicalIVPHI->getScalarType(), *MaxSafeElements)); + VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); + AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl"); + } + auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL, + DebugLoc()); auto *CanonicalIVIncrement = cast(CanonicalIVPHI->getBackedgeValue()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 3b792ee32dce6..60a44bfb0dca6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -108,7 +108,9 @@ struct VPlanTransforms { /// VPCanonicalIVPHIRecipe is only used to control the loop after /// this transformation. /// \returns true if the transformation succeeds, or false if it doesn't. 
- static bool tryAddExplicitVectorLength(VPlan &Plan); + static bool + tryAddExplicitVectorLength(VPlan &Plan, + const std::optional &MaxEVLSafeElements); // For each Interleave Group in \p InterleaveGroups replace the Recipes // widening its memory instructions with a single VPInterleaveRecipe at its diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll index 2dd47d5c1ea8a..322a6c16871ae 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll @@ -422,28 +422,37 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; IF-EVL-NEXT: entry: ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 3002, [[TMP1]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; 
IF-EVL-NEXT: [[AVL:%.*]] = sub i64 3002, [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult i64 [[AVL]], 1024 +; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP9]], i64 [[AVL]], i64 1024 +; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 1, i1 true) +; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[EVL_BASED_IV]], 0 ; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] ; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0 -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 32, <4 x i1> [[TMP1]], <4 x i64> poison) +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv1i64.p0(ptr align 32 [[TMP3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 1024 ; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]] ; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 -; IF-EVL-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_MASKED_LOAD]], ptr [[TMP6]], i32 32, <4 x i1> [[TMP1]]) -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3004 -; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL-NEXT: call void @llvm.vp.store.nxv1i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: middle.block: ; 
IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: loop: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]