
Commit 98b6da1
[LV] strip TailFoldingStyle::DataWithoutLaneMask
There is just one use of TailFoldingStyle::DataWithoutLaneMask in LoopVectorize, introduced by 413a66f ([LV, VP]VP intrinsics support for the Loop Vectorizer + adding new tail-folding mode using EVL.), and it is unnecessary: @llvm.get.active.lane.mask is unrelated to EVL. Moreover, since 243a532 ([SelectionDAG] Lower @llvm.get.active.lane.mask to setcc), SelectionDAG automatically detects whether a target supports the @llvm.get.active.lane.mask intrinsic and lowers it to equivalent instructions on targets where it is not preferred.
1 parent: cf2e828
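For reference, @llvm.get.active.lane.mask(%base, %n) yields a mask whose lane i is active iff %base + i < %n (unsigned). Below is a minimal sketch of the generic expansion SelectionDAG falls back to on targets that do not prefer the intrinsic; the function and value names are illustrative, not taken from the patch.

; Lane i is active iff %index + i < %n (unsigned).
define <4 x i1> @active_lane_mask_expansion(i64 %index, i64 %n) {
  ; Splat the scalar index and trip count across all lanes.
  %base.ins   = insertelement <4 x i64> poison, i64 %index, i64 0
  %base.splat = shufflevector <4 x i64> %base.ins, <4 x i64> poison, <4 x i32> zeroinitializer
  %n.ins      = insertelement <4 x i64> poison, i64 %n, i64 0
  %n.splat    = shufflevector <4 x i64> %n.ins, <4 x i64> poison, <4 x i32> zeroinitializer
  ; Per-lane index = %index + i, then the "setcc" of 243a532.
  %lane.idx = add <4 x i64> %base.splat, <i64 0, i64 1, i64 2, i64 3>
  %mask     = icmp ult <4 x i64> %lane.idx, %n.splat
  ret <4 x i1> %mask
}

; Equivalent (up to overflow behavior at the i64 boundary, which the intrinsic
; defines away by computing in a wide enough type) to:
;   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %n)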

5 files changed: +67 -78 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 4 additions & 6 deletions
@@ -252,7 +252,7 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
               "Similar to data-and-control, but remove the runtime check"),
     clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
                "Use predicated EVL instructions for tail folding. If EVL "
-               "is unsupported, fallback to data-without-lane-mask.")));
+               "is unsupported, fallback to data.")));
 
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -1451,12 +1451,10 @@ class LoopVectorizationCostModel {
         TTI.hasActiveVectorLength(0, nullptr, Align()) &&
         !EnableVPlanNativePath;
     if (!EVLIsLegal) {
-      // If for some reason EVL mode is unsupported, fallback to
-      // DataWithoutLaneMask to try to vectorize the loop with folded tail
-      // in a generic way.
+      // If for some reason EVL mode is unsupported, fallback to Data to try to
+      // vectorize the loop with folded tail in a generic way.
       ChosenTailFoldingStyle =
-          std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
-                         TailFoldingStyle::DataWithoutLaneMask);
+          std::make_pair(TailFoldingStyle::Data, TailFoldingStyle::Data);
       LLVM_DEBUG(
           dbgs()
          << "LV: Preference for VP intrinsics indicated. Will "

llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll

Lines changed: 5 additions & 8 deletions
@@ -8,10 +8,8 @@
 
 define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; CHECK-LABEL: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
 ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count
+; CHECK-NEXT: Live-in vp<[[VTC:%.*]]> = vector-trip-count
 ; CHECK-NEXT: Live-in ir<%N> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:
@@ -23,17 +21,16 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; CHECK-NEXT: <x1> vector loop: {
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_INC:%.*]]>
-; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
-; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]>
+; CHECK-NEXT: vp<[[STEPS:%.*]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = active lane mask vp<[[STEPS]]>, ir<%N>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <xVFxUF> pred.store: {
 ; CHECK-NEXT: pred.store.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<[[CMP]]>
+; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]>
 ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store.if:
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<[[STEPS]]>
 ; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
 ; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<[[STEPS]]>
@@ -49,7 +46,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; CHECK-NEXT: Successor(s): for.body.2
 ; CHECK-EMPTY:
 ; CHECK-NEXT: for.body.2:
-; CHECK-NEXT: EMIT vp<[[CAN_INC:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_INC]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_INC]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll

Lines changed: 40 additions & 38 deletions
@@ -25,7 +25,6 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
@@ -34,37 +33,40 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
 ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
 ; IF-EVL: vector.body:
 ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP21]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
-; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP24]], i32 4, <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[TMP25:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
-; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0
-; IF-EVL-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 4
-; IF-EVL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP31]]
-; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP25]], ptr [[TMP29]], i32 4, <vscale x 4 x i1> [[TMP19]])
-; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr [[TMP32]], i32 4, <vscale x 4 x i1> [[TMP20]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 0
+; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 1
+; IF-EVL-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], [[TMP17]]
+; IF-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP13]], i64 [[N]])
+; IF-EVL-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP18]], i64 [[N]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
+; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP21]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[TMP23:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER2]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
+; IF-EVL-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 4
+; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP28]]
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP23]], ptr [[TMP26]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP24]], ptr [[TMP29]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK1]])
 ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
 ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
-; IF-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; IF-EVL: scalar.ph:
@@ -73,10 +75,10 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL: for.body:
 ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
-; IF-EVL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
-; IF-EVL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP35]], [[TMP34]]
+; IF-EVL-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]]
 ; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -120,15 +122,15 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; NO-VP-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP15]]
 ; NO-VP-NEXT: [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP18]]
 ; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
-; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
-; NO-VP-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4
-; NO-VP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP26]]
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP20]], ptr [[TMP24]], align 4
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP27]], align 4
+; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
+; NO-VP-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 4
+; NO-VP-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP25]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP20]], ptr [[TMP23]], align 4
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP26]], align 4
 ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-VP-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-VP-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; NO-VP: middle.block:
 ; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -138,10 +140,10 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; NO-VP: for.body:
 ; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
-; NO-VP-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
-; NO-VP-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP30]], [[TMP29]]
+; NO-VP-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP29]], [[TMP28]]
 ; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1

llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll

Lines changed: 16 additions & 22 deletions
@@ -17,30 +17,24 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15
 ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
 ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
 ; IF-EVL: vector.body:
 ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
-; IF-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
-; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]])
+; IF-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[TMP0]], i64 [[N]])
+; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP2]], i32 4, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> poison)
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP4]], i32 4, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> poison)
+; IF-EVL-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <16 x i1> [[ACTIVE_LANE_MASK]])
 ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; IF-EVL: scalar.ph:
@@ -49,10 +43,10 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL: for.body:
 ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP9]]
 ; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
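For context on how outputs like the IF-EVL and NO-VP prefixes above are produced: tests of this kind typically request EVL tail folding via the cl::opt changed in this commit, and on targets without EVL support the vectorizer now falls back to TailFoldingStyle::Data. A sketch of such a RUN line, assuming the flag spellings used by similar EVL tests (the actual RUN lines are not part of this diff):

; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize \
; RUN:   -S %s | FileCheck %s --check-prefix=IF-EVL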
