llvm · ElvisWang123 · Sep 25, 2024 · Sep 23, 2024 · Sep 23, 2024 · Sep 24, 2024
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2709,6 +2709,10 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   /// Generate the wide load or gather.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPWidenLoadEVLRecipe for \p VF using \p Ctx.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2787,6 +2791,10 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   /// Generate the wide store or scatter.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPWidenStoreEVLRecipe for \p VF using \p Ctx.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2364,6 +2364,32 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   State.set(this, Res, 0);
 }
 
+InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
+                                                  VPCostContext &Ctx) const {
+  // Non-consecutive or explicitly masked loads are costed by the generic
+  // widened-memory implementation.
+  if (!Consecutive || IsMasked)
+    return VPWidenMemoryRecipe::computeCost(VF, Ctx);
+
+  // EVL recipes use EVL in place of the tail mask, but the legacy cost model
+  // always accounts for the cost of the mask. Query getMaskedMemoryOpCost()
+  // rather than getMemoryOpCost() so that both models agree.
+  // TODO: Switch to getMemoryOpCost() once the result no longer has to be
+  // compared against the legacy cost model.
+  Instruction *Inst = const_cast<Instruction *>(&Ingredient);
+  Type *VecTy = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+  const Align Alignment = getLoadStoreAlignment(Inst);
+  unsigned AS = getLoadStoreAddressSpace(Inst);
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
+      Ingredient.getOpcode(), VecTy, Alignment, AS, CostKind);
+
+  // A reversed access additionally pays for reversing the vector.
+  if (Reverse)
+    Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                   cast<VectorType>(VecTy), {}, CostKind, 0);
+  return Cost;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
@@ -2464,6 +2490,32 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   State.addMetadata(NewSI, SI);
 }
 
+InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
+                                                   VPCostContext &Ctx) const {
+  // Non-consecutive or explicitly masked stores are costed by the generic
+  // widened-memory implementation.
+  if (!Consecutive || IsMasked)
+    return VPWidenMemoryRecipe::computeCost(VF, Ctx);
+
+  // EVL recipes use EVL in place of the tail mask, but the legacy cost model
+  // always accounts for the cost of the mask. Query getMaskedMemoryOpCost()
+  // rather than getMemoryOpCost() so that both models agree.
+  // TODO: Switch to getMemoryOpCost() once the result no longer has to be
+  // compared against the legacy cost model.
+  Instruction *Inst = const_cast<Instruction *>(&Ingredient);
+  Type *VecTy = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+  const Align Alignment = getLoadStoreAlignment(Inst);
+  unsigned AS = getLoadStoreAddressSpace(Inst);
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
+      Ingredient.getOpcode(), VecTy, Alignment, AS, CostKind);
+
+  // A reversed access additionally pays for reversing the vector.
+  if (Reverse)
+    Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                   cast<VectorType>(VecTy), {}, CostKind, 0);
+  return Cost;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -S | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl -S | FileCheck %s --check-prefixes=EVL
+; Generated from issue #109468.
+; In this test case, the vector store with a tail mask is transformed into a vp intrinsic with EVL.
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-gnu"
+
+define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
+; CHECK-LABEL: define void @lshift_significand(
+; CHECK-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 3
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[VEC_IV]], i32 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP14:%.*]] = sub nuw nsw i64 1, [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -3
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 8, <4 x i1> [[REVERSE]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], 4
+; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP24:%.*]] = sub nuw nsw i64 1, [[IV]]
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP24]]
+; CHECK-NEXT:    store i64 0, ptr [[ARRAYIDX13]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+; EVL-LABEL: define void @lshift_significand(
+; EVL-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; EVL-NEXT:  [[ENTRY:.*]]:
+; EVL-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
+; EVL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
+; EVL-NEXT:    [[TMP1:%.*]] = sub i64 3, [[SPEC_SELECT]]
+; EVL-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[TMP1]]
+; EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; EVL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; EVL-NEXT:    br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; EVL:       [[VECTOR_PH]]:
+; EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP8]]
+; EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; EVL-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
+; EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
+; EVL:       [[VECTOR_BODY]]:
+; EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP1]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 2, i1 true)
+; EVL-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; EVL-NEXT:    [[TMP14:%.*]] = sub nuw nsw i64 1, [[TMP13]]
+; EVL-NEXT:    [[TMP15:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP14]]
+; EVL-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; EVL-NEXT:    [[TMP18:%.*]] = mul i64 0, [[TMP17]]
+; EVL-NEXT:    [[TMP19:%.*]] = sub i64 1, [[TMP17]]
+; EVL-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP15]], i64 [[TMP18]]
+; EVL-NEXT:    [[TMP21:%.*]] = getelementptr i64, ptr [[TMP20]], i64 [[TMP19]]
+; EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP12]])
+; EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP21]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP12]])
+; EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP12]] to i64
+; EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP22]], [[EVL_BASED_IV]]
+; EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; EVL-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL:       [[MIDDLE_BLOCK]]:
+; EVL-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; EVL:       [[SCALAR_PH]]:
+; EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
+; EVL-NEXT:    br label %[[LOOP:.*]]
+; EVL:       [[LOOP]]:
+; EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; EVL-NEXT:    [[TMP24:%.*]] = sub nuw nsw i64 1, [[IV]]
+; EVL-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP24]]
+; EVL-NEXT:    store i64 0, ptr [[ARRAYIDX13]], align 8
+; EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
+; EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; EVL:       [[EXIT]]:
+; EVL-NEXT:    ret void
+;
+entry:
+  %cmp1.peel = icmp eq i32 %n, 0
+  %spec.select = select i1 %cmp1.peel, i64 2, i64 0
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %spec.select, %entry ], [ %iv.next, %loop ]
+  %1 = sub nuw nsw i64 1, %iv
+  %arrayidx13 = getelementptr i64, ptr %dst, i64 %1
+  store i64 0, ptr %arrayidx13, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 3
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.