Skip to content

Commit d4951b7

Browse files
committed
[VPlan] Introduce chainUsesScalarValues
Introduce chainUsesScalarValues, which walks the chain of recipes that use a given root, looking through widened recipes (i.e. ignoring widening decisions), and determines whether the final leaves of that chain use only scalar values of the root. Demonstrate its utility in narrowToSingleScalarRecipes, showing that it is essentially a drop-in replacement for onlyScalarValuesUsed for the purposes of optimizations.
1 parent 2545209 commit d4951b7

13 files changed

+88
-101
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1402,16 +1402,11 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
14021402
continue;
14031403
}
14041404

1405-
// Skip recipes that aren't single scalars or don't have only their
1406-
// scalar results used. In the latter case, we would introduce extra
1407-
// broadcasts.
1405+
// Only consider recipes that are single scalars whose scalar value is
1406+
// used by the final leaves in a recipe-chain walk: if the final leaves
1407+
// don't use the scalar value, it could introduce extra broadcasts.
14081408
if (!vputils::isSingleScalar(RepOrWidenR) ||
1409-
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1410-
return U->usesScalars(RepOrWidenR) ||
1411-
match(cast<VPRecipeBase>(U),
1412-
m_CombineOr(m_ExtractLastElement(m_VPValue()),
1413-
m_ExtractLastLanePerPart(m_VPValue())));
1414-
}))
1409+
!vputils::chainUsesScalarValues(RepOrWidenR))
14151410
continue;
14161411

14171412
auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),

llvm/lib/Transforms/Vectorize/VPlanUtils.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,30 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) {
3131
[Def](const VPUser *U) { return U->usesScalars(Def); });
3232
}
3333

34+
// Walks the users of \p Root transitively and returns true if no user reached
// by the walk is rejected. Users that are VPWiden{,Cast,Call,GEP,Select,
// Intrinsic}Recipe are looked through (their own users are visited instead);
// every other user is a leaf and is checked against the operand through which
// it was reached. Returns true when \p Root has no users.
// NOTE(review): entries are not de-duplicated, so a recipe reachable along
// several paths is re-examined once per path -- confirm this is acceptable.
bool vputils::chainUsesScalarValues(const VPValue *Root) {
35+
// Pending (operand, user) pairs; seeded with the direct users of Root.
SmallVector<std::pair<const VPValue *, const VPUser *>> Worklist;
36+
for (const VPUser *V : Root->users())
37+
Worklist.emplace_back(Root, V);
38+
while (!Worklist.empty()) {
39+
auto [Op, U] = Worklist.pop_back_val();
40+
// Widened recipes are not checked themselves: enqueue their users, with the
// recipe's own result as the operand, and move on.
if (isa<VPWidenRecipe, VPWidenCastRecipe, VPWidenCallRecipe,
41+
VPWidenGEPRecipe, VPWidenSelectRecipe, VPWidenIntrinsicRecipe>(U)) {
42+
// NOTE(review): presumes every recipe kind listed above is a
// VPSingleDefRecipe -- confirm against the class hierarchy.
const VPValue *Def = cast<VPSingleDefRecipe>(U);
43+
for (const VPUser *V : Def->users())
44+
Worklist.emplace_back(Def, V);
45+
continue;
46+
}
47+
// A widened memory recipe is accepted when the operand it was reached through
// is a single scalar.
if (isa<VPWidenMemoryRecipe>(U) && vputils::isSingleScalar(Op))
48+
continue;
49+
// A VPInstruction that is vector-to-scalar or single-scalar is accepted
// without consulting usesScalars().
if (auto *VPI = dyn_cast<VPInstruction>(U))
50+
if (VPI->isVectorToScalar() || VPI->isSingleScalar())
51+
continue;
52+
// Any remaining user must use only scalars of the operand it was reached
// through (which is not necessarily Root itself).
if (!U->usesScalars(Op))
53+
return false;
54+
}
55+
// Worklist drained without finding a rejecting user.
return true;
56+
}
57+
3458
VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
3559
VPValue *Expanded = nullptr;
3660
if (auto *E = dyn_cast<SCEVConstant>(Expr))

llvm/lib/Transforms/Vectorize/VPlanUtils.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ bool onlyFirstPartUsed(const VPValue *Def);
2828
/// Returns true if only scalar values of \p Def are used by all users.
2929
bool onlyScalarValuesUsed(const VPValue *Def);
3030

31+
/// Walks the chain of recipes using \p Root, looking through widened recipes
32+
/// (i.e. ignoring widening decisions), and returns true if every final leaf
33+
/// uses only scalar values of the operand through which it was reached. Also
/// returns true if \p Root has no users.
34+
bool chainUsesScalarValues(const VPValue *Root);
35+
3136
/// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p
3237
/// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in
3338
/// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's

llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
; CM: vector.ph:
1717
; CM: CLONE ir<%a> = extractvalue ir<%sv>
1818
; CM: CLONE ir<%b> = extractvalue ir<%sv>
19-
; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
19+
; CM: CLONE ir<%add> = add ir<%a>, ir<%b>
2020
; CM: Successor(s): vector loop
2121

2222
; CM: LV: Scalar loop costs: 5.
@@ -30,17 +30,15 @@ define void @test1(ptr %dst, {i64, i64} %sv) {
3030
; FORCED-NEXT: br label %[[VECTOR_PH:.*]]
3131
; FORCED: [[VECTOR_PH]]:
3232
; FORCED-NEXT: [[TMP0:%.*]] = extractvalue { i64, i64 } [[SV]], 0
33-
; FORCED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
34-
; FORCED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
3533
; FORCED-NEXT: [[TMP4:%.*]] = extractvalue { i64, i64 } [[SV]], 1
36-
; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
34+
; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[TMP0]], [[TMP4]]
35+
; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
3736
; FORCED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
38-
; FORCED-NEXT: [[TMP1:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
3937
; FORCED-NEXT: br label %[[VECTOR_BODY:.*]]
4038
; FORCED: [[VECTOR_BODY]]:
4139
; FORCED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
4240
; FORCED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]]
43-
; FORCED-NEXT: store <2 x i64> [[TMP1]], ptr [[TMP2]], align 4
41+
; FORCED-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP2]], align 4
4442
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
4543
; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
4644
; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]

llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,15 @@ define void @widen_extractvalue(ptr %dst, {i64, i64} %sv) #0 {
1717
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1000, [[TMP3]]
1818
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1000, [[N_MOD_VF]]
1919
; CHECK-NEXT: [[EXTRACT0:%.*]] = extractvalue { i64, i64 } [[SV]], 0
20-
; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT0]], i64 0
21-
; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
2220
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { i64, i64 } [[SV]], 1
23-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i64 0
21+
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[EXTRACT0]], [[TMP10]]
22+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
2423
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
25-
; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT2]], [[BROADCAST_SPLAT2]]
2624
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
2725
; CHECK: [[VECTOR_BODY]]:
2826
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
2927
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]]
30-
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8
28+
; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP8]], align 8
3129
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
3230
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
3331
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]

llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,9 +293,9 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
293293
; CHECK-NEXT: [[ENTRY:.*:]]
294294
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
295295
; CHECK: [[VECTOR_PH]]:
296-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
296+
; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[A]], -1
297+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
297298
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
298-
; CHECK-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 -1)
299299
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
300300
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 9)
301301
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
@@ -309,7 +309,7 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
309309
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
310310
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
311311
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
312-
; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
312+
; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
313313
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
314314
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
315315
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0

llvm/test/Transforms/LoopVectorize/X86/cost-model.ll

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 {
334334
; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
335335
; CHECK-NEXT: [[UMIN7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]])
336336
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN7]], 1
337-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 28
337+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 14
338338
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
339339
; CHECK: vector.scevcheck:
340340
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1)
@@ -369,14 +369,12 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 {
369369
; CHECK: vector.body:
370370
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
371371
; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[SRC_1]], align 8, !alias.scope [[META6:![0-9]+]]
372-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP13]], i64 0
373-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
374372
; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META9:![0-9]+]]
375-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP14]], i64 0
376-
; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT9]], <2 x i64> poison, <2 x i32> zeroinitializer
377-
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
378-
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer
379-
; CHECK-NEXT: [[TMP17:%.*]] = and <2 x i1> [[TMP16]], [[TMP15]]
373+
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[TMP13]], 0
374+
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP14]], 0
375+
; CHECK-NEXT: [[TMP21:%.*]] = and i1 [[TMP16]], [[TMP15]]
376+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[TMP21]], i64 0
377+
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
380378
; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i1> [[TMP17]] to <2 x i8>
381379
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 1
382380
; CHECK-NEXT: store i8 [[TMP19]], ptr [[DST]], align 1, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]

llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -199,10 +199,6 @@ define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) {
199199
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 7
200200
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[A]], align 4
201201
; CHECK-NEXT: [[TMP10:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
202-
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i1> poison, i1 [[TMP10]], i32 0
203-
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i1> [[TMP8]], i1 [[TMP10]], i32 1
204-
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> [[TMP9]], i1 [[TMP10]], i32 2
205-
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 3
206202
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]]
207203
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]]
208204
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
@@ -211,7 +207,9 @@ define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) {
211207
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP16]]
212208
; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP17]]
213209
; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP18]]
214-
; CHECK-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP14]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00)
210+
; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP10]], float 1.000000e+01, float 1.000000e+00
211+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP36]], i64 0
212+
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
215213
; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP19]], align 4
216214
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP20]], align 4
217215
; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP21]], align 4

llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll

Lines changed: 12 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -64,39 +64,24 @@ exit:
6464
define void @uniform_load_can_fold_users(ptr noalias %src, ptr %dst, i64 %start, double %d) {
6565
; CHECK-LABEL: define void @uniform_load_can_fold_users(
6666
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i64 [[START:%.*]], double [[D:%.*]]) {
67-
; CHECK-NEXT: [[ENTRY:.*:]]
68-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[START]], 1
69-
; CHECK-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 0)
70-
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]]
71-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2
72-
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
73-
; CHECK: [[VECTOR_PH]]:
74-
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2
75-
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
76-
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]]
77-
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
78-
; CHECK: [[VECTOR_BODY]]:
79-
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
80-
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
81-
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1
67+
; CHECK-NEXT: [[ENTRY:.*]]:
68+
; CHECK-NEXT: br label %[[LOOP:.*]]
69+
; CHECK: [[LOOP]]:
70+
; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ]
71+
; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ]
8272
; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[SRC]], align 8
83-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0
84-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
85-
; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[BROADCAST_SPLAT]], splat (double 9.000000e+00)
86-
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
73+
; CHECK-NEXT: [[TMP7:%.*]] = fmul double [[TMP5]], 9.000000e+00
8774
; CHECK-NEXT: [[TMP8:%.*]] = fdiv double [[TMP7]], [[D]]
88-
; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1
8975
; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP4]], 1
90-
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP3]]
9176
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]]
92-
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP11]], i64 [[TMP9]]
9377
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP12]], i64 [[TMP10]]
94-
; CHECK-NEXT: store double [[TMP8]], ptr [[TMP13]], align 8
9578
; CHECK-NEXT: store double [[TMP8]], ptr [[TMP14]], align 8
96-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
97-
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
98-
; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
99-
; CHECK: [[MIDDLE_BLOCK]]:
79+
; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[TMP4]], 1
80+
; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1
81+
; CHECK-NEXT: [[EC:%.*]] = icmp sgt i64 [[IV_2]], 0
82+
; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]]
83+
; CHECK: [[EXIT]]:
84+
; CHECK-NEXT: ret void
10085
;
10186
entry:
10287
br label %loop

llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -134,21 +134,18 @@ define i16 @for_phi_removed(ptr %src) {
134134
; UNROLL-NO-IC: [[VECTOR_BODY]]:
135135
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
136136
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
137-
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
138-
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
139-
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
140-
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer
137+
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
138+
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0
141139
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
142140
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 104
143141
; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
144142
; UNROLL-NO-IC: [[MIDDLE_BLOCK]]:
145-
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
146143
; UNROLL-NO-IC-NEXT: br label %[[SCALAR_PH:.*]]
147144
; UNROLL-NO-IC: [[SCALAR_PH]]:
148145
; UNROLL-NO-IC-NEXT: br label %[[LOOP:.*]]
149146
; UNROLL-NO-IC: [[LOOP]]:
150147
; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ 104, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
151-
; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
148+
; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
152149
; UNROLL-NO-IC-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4
153150
; UNROLL-NO-IC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0
154151
; UNROLL-NO-IC-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0
@@ -199,21 +196,18 @@ define i16 @for_phi_removed(ptr %src) {
199196
; SINK-AFTER: [[VECTOR_BODY]]:
200197
; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
201198
; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
202-
; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
203-
; SINK-AFTER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
204-
; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
205-
; SINK-AFTER-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer
199+
; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
200+
; SINK-AFTER-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0
206201
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
207202
; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 108
208203
; SINK-AFTER-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
209204
; SINK-AFTER: [[MIDDLE_BLOCK]]:
210-
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
211205
; SINK-AFTER-NEXT: br label %[[SCALAR_PH:.*]]
212206
; SINK-AFTER: [[SCALAR_PH]]:
213207
; SINK-AFTER-NEXT: br label %[[LOOP:.*]]
214208
; SINK-AFTER: [[LOOP]]:
215209
; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ 108, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
216-
; SINK-AFTER-NEXT: [[P:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
210+
; SINK-AFTER-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
217211
; SINK-AFTER-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4
218212
; SINK-AFTER-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0
219213
; SINK-AFTER-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0

0 commit comments

Comments
 (0)