
Commit a068b97

[VPlan] Implement VPWidenLoad/StoreEVLRecipe::computeCost(). (#109644)
Currently the EVL recipes fold the tail mask into the EVL operand. In the legacy cost model, however, the mask is still present and its instruction cost is included. To eliminate the difference between the VPlan-based cost model and the legacy cost model, we always account for the instruction cost of the mask in the EVL recipes. Note that the mask cost should be removed from the EVL recipes once we no longer need to compare against the legacy cost model. This patch also fixes #109468.
1 parent 5103910 commit a068b97
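
For readers comparing the two models, here is a minimal sketch of why the costs diverged for a consecutive, tail-folded load before this commit. It is not part of the patch: the helper functions and the standalone framing are hypothetical, and only the two TargetTransformInfo hooks are taken from the change.

// Sketch only: legacyStyleLoadCost/evlStyleLoadCost are made-up helpers for
// illustration; the TargetTransformInfo hooks are the real ones involved.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Legacy cost model: the tail mask is still modeled explicitly, so the
// masked memory-op hook is queried for a consecutive, tail-folded load.
static InstructionCost legacyStyleLoadCost(const TargetTransformInfo &TTI,
                                           Type *VecTy, Align Alignment,
                                           unsigned AddrSpace) {
  return TTI.getMaskedMemoryOpCost(Instruction::Load, VecTy, Alignment,
                                   AddrSpace,
                                   TargetTransformInfo::TCK_RecipThroughput);
}

// EVL recipe before this patch: the tail mask is folded into the EVL
// operand, so only the plain memory-op cost would be queried. On targets
// where the two hooks return different numbers, the VPlan-based and legacy
// models disagreed, which is what #109468 hit.
static InstructionCost evlStyleLoadCost(const TargetTransformInfo &TTI,
                                        Type *VecTy, Align Alignment,
                                        unsigned AddrSpace) {
  return TTI.getMemoryOpCost(Instruction::Load, VecTy, Alignment, AddrSpace,
                             TargetTransformInfo::TCK_RecipThroughput);
}

The patch resolves the mismatch by making the EVL recipes issue the masked query as well, with a TODO to switch back to the unmasked query once the legacy comparison is gone.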

3 files changed: +150 −0 lines changed


llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 8 additions & 0 deletions
@@ -2709,6 +2709,10 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   /// Generate the wide load or gather.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPWidenLoadEVLRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2787,6 +2791,10 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   /// Generate the wide store or scatter.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPWidenStoreEVLRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 50 additions & 0 deletions
@@ -2267,6 +2267,31 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   State.set(this, Res);
 }
 
+InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
+                                                  VPCostContext &Ctx) const {
+  if (!Consecutive || IsMasked)
+    return VPWidenMemoryRecipe::computeCost(VF, Ctx);
+
+  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
+  // here because the EVL recipes use the EVL to replace the tail mask, but
+  // the legacy model always calculates the cost of the mask.
+  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() when we
+  // don't need to compare to the legacy cost model.
+  Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+  const Align Alignment =
+      getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
+  unsigned AS =
+      getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
+      Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
+  if (!Reverse)
+    return Cost;
+
+  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                       cast<VectorType>(Ty), {}, CostKind, 0);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
@@ -2363,6 +2388,31 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   State.addMetadata(NewSI, SI);
 }
 
+InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
+                                                   VPCostContext &Ctx) const {
+  if (!Consecutive || IsMasked)
+    return VPWidenMemoryRecipe::computeCost(VF, Ctx);
+
+  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
+  // here because the EVL recipes use the EVL to replace the tail mask, but
+  // the legacy model always calculates the cost of the mask.
+  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() when we
+  // don't need to compare to the legacy cost model.
+  Type *Ty = ToVectorTy(getLoadStoreType(&Ingredient), VF);
+  const Align Alignment =
+      getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
+  unsigned AS =
+      getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
+      Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
+  if (!Reverse)
+    return Cost;
+
+  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                       cast<VectorType>(Ty), {}, CostKind, 0);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl -S | FileCheck %s
+; Generated from issue #109468.
+; In this test case, the vector store with a tail mask is transformed into the vp intrinsic with EVL.
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-gnu"
+
+define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
+; CHECK-LABEL: define void @lshift_significand(
+; CHECK-SAME: i32 [[N:%.*]], ptr nocapture writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 3, [[SPEC_SELECT]]
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 -1, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP0]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 2, i1 true)
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[SPEC_SELECT]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = sub nuw nsw i64 1, [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP14]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]]
+; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP20]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP11]])
+; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = sub nuw nsw i64 1, [[IV]]
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP23]]
+; CHECK-NEXT: store i64 0, ptr [[ARRAYIDX13]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %cmp1.peel = icmp eq i32 %n, 0
+  %spec.select = select i1 %cmp1.peel, i64 2, i64 0
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %spec.select, %entry ], [ %iv.next, %loop ]
+  %1 = sub nuw nsw i64 1, %iv
+  %arrayidx13 = getelementptr i64, ptr %dst, i64 %1
+  store i64 0, ptr %arrayidx13, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 3
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
