llvm · ElvisWang123 · Sep 25, 2024 · Sep 23, 2024 · Sep 23, 2024 · Sep 24, 2024
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2787,6 +2787,10 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   /// Generate the wide store or scatter.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPWidenStoreEVLRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2464,6 +2464,43 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   State.addMetadata(NewSI, SI);
 }
 
+InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
+                                                   VPCostContext &Ctx) const {
+  Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+  const Align Alignment =
+      getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
+  unsigned AS =
+      getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+  if (!Consecutive) {
+    // TODO: Using the original IR may not be accurate.
+    // Currently, ARM will use the underlying IR to calculate gather/scatter
+    // instruction cost.
+    const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+    assert(!Reverse &&
+           "Inconsecutive memory access should not have the order.");
+    return Ctx.TTI.getAddressComputationCost(Ty) +
+           Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
+                                          IsMasked, Alignment, CostKind,
+                                          &Ingredient);
-    // TODO: Using the original IR may not be accurate.
-    // Currently, ARM will use the underlying IR to calculate gather/scatter
-    // instruction cost.
-    const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
-    assert(!Reverse &&
-           "Inconsecutive memory access should not have the order.");
-    return Ctx.TTI.getAddressComputationCost(Ty) +
-           Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
-                                          IsMasked, Alignment, CostKind,
-                                          &Ingredient);
+  return VPWidenStoreRecipe::computeCost();
-    // TODO: Using the original IR may not be accurate.
-    // Currently, ARM will use the underlying IR to calculate gather/scatter
-    // instruction cost.
-    const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
-    assert(!Reverse &&
-           "Inconsecutive memory access should not have the order.");
-    return Ctx.TTI.getAddressComputationCost(Ty) +
-           Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
-                                          IsMasked, Alignment, CostKind,
-                                          &Ingredient);
+  return VPWidenStoreRecipe::computeCost();
+  }
+
+  InstructionCost Cost = 0;
+  // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost()
+  // here because the EVL recipes using EVL to replace the tail mask. But in the
+  // legacy model, it will always calculate the cost of mask.
+  // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
+  // don't need to care the legacy cost model.
+  Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
-  InstructionCost Cost = 0;
-  // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost()
-  // here because the EVL recipes using EVL to replace the tail mask. But in the
-  // legacy model, it will always calculate the cost of mask.
-  // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
-  // don't need to care the legacy cost model.
-  Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
+  // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost()
+  // here because the EVL recipes using EVL to replace the tail mask. But in the
+  // legacy model, it will always calculate the cost of mask.
+  // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
+  // don't need to care the legacy cost model.
+  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
-  InstructionCost Cost = 0;
-  // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost()
-  // here because the EVL recipes using EVL to replace the tail mask. But in the
-  // legacy model, it will always calculate the cost of mask.
-  // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
-  // don't need to care the legacy cost model.
-  Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
+  // We need to use the getMaskedMemoryOpCost() instead of getMemoryOpCost()
+  // here because the EVL recipes using EVL to replace the tail mask. But in the
+  // legacy model, it will always calculate the cost of mask.
+  // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we
+  // don't need to care the legacy cost model.
+  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
+                                        AS, CostKind);
+  if (!Reverse)
+    return Cost;
+
+  return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                        cast<VectorType>(Ty), {}, CostKind, 0);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl
+; Generated from issue #109468.
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-gnu"
+
+define void @lshift_significand(i32 %n, ptr nocapture writeonly %0) local_unnamed_addr #0 {
-define void @lshift_significand(i32 %n, ptr nocapture writeonly %0) local_unnamed_addr #0 {
+define void @evl_store_cost(i32 %n, ptr nocapture writeonly %dst) {
-define void @lshift_significand(i32 %n, ptr nocapture writeonly %0) local_unnamed_addr #0 {
+define void @evl_store_cost(i32 %n, ptr nocapture writeonly %dst) {
+entry:
+  %cmp1.peel = icmp eq i32 %n, 0
+  %spec.select = select i1 %cmp1.peel, i64 2, i64 0
+  br label %for.body9
+
+for.body9:                                        ; preds = %entry, %for.body9
-for.body9:                                        ; preds = %entry, %for.body9
+loop:
-for.body9:                                        ; preds = %entry, %for.body9
+loop:
+  %indvars.iv = phi i64 [ %spec.select, %entry ], [ %indvars.iv.next, %for.body9 ]
-  %indvars.iv = phi i64 [ %spec.select, %entry ], [ %indvars.iv.next, %for.body9 ]
+  %iv  = phi i64 [ %spec.select, %entry ], [ %indvars.iv.next, %for.body9 ]
-  %indvars.iv = phi i64 [ %spec.select, %entry ], [ %indvars.iv.next, %for.body9 ]
+  %iv  = phi i64 [ %spec.select, %entry ], [ %indvars.iv.next, %for.body9 ]
+  %1 = sub nuw nsw i64 1, %indvars.iv
+  %arrayidx13 = getelementptr [3 x i64], ptr %0, i64 0, i64 %1
-  %arrayidx13 = getelementptr [3 x i64], ptr %0, i64 0, i64 %1
+  %arrayidx13 = getelementptr i64, ptr %0, i64 %1
-  %arrayidx13 = getelementptr [3 x i64], ptr %0, i64 0, i64 %1
+  %arrayidx13 = getelementptr i64, ptr %0, i64 %1
+  store i64 0, ptr %arrayidx13, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 3
+  br i1 %exitcond.not, label %for.end16, label %for.body9
+
+for.end16:                                        ; preds = %for.body9
-for.end16:                                        ; preds = %for.body9
+exit:  
-for.end16:                                        ; preds = %for.body9
+exit:  
+  ret void
+}