Skip to content

Commit 8d90453

Browse files
committed
[VPlan] Handle stores to single-scalar addr in narrowToSingleScalarRecipes.
Move the handling of stores to single-scalar/uniform addresses from replicateByVF to narrowToSingleScalarRecipes. (cherry picked from commit 1efa997)
1 parent b7a1ae9 commit 8d90453

22 files changed

+143
-76
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1296,6 +1296,20 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
12961296
continue;
12971297

12981298
auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1299+
if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1300+
vputils::isSingleScalar(RepR->getOperand(1))) {
1301+
auto *Clone = new VPReplicateRecipe(
1302+
RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1303+
true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
1304+
Clone->insertBefore(RepOrWidenR);
1305+
auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
1306+
{Clone->getOperand(0)});
1307+
Ext->insertBefore(Clone);
1308+
Clone->setOperand(0, Ext);
1309+
RepR->eraseFromParent();
1310+
continue;
1311+
}
1312+
12991313
// Skip recipes that aren't single scalars or don't have only their
13001314
// scalar results used. In the latter case, we would introduce extra
13011315
// broadcasts.
@@ -1988,9 +2002,8 @@ void VPlanTransforms::optimize(VPlan &Plan) {
19882002

19892003
runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
19902004
runPass(simplifyBlends, Plan);
1991-
runPass(removeDeadRecipes, Plan);
1992-
runPass(narrowToSingleScalarRecipes, Plan);
19932005
runPass(legalizeAndOptimizeInductions, Plan);
2006+
runPass(narrowToSingleScalarRecipes, Plan);
19942007
runPass(removeRedundantExpandSCEVRecipes, Plan);
19952008
runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
19962009
runPass(removeBranchOnConst, Plan);

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -528,16 +528,9 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
528528

529529
VPBuilder Builder(RepR);
530530
if (RepR->getNumUsers() == 0) {
531-
if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
532-
vputils::isSingleScalar(RepR->getOperand(1))) {
533-
// Stores to invariant addresses need to store the last lane only.
534-
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
535-
Def2LaneDefs);
536-
} else {
537-
// Create single-scalar version of RepR for all lanes.
538-
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
539-
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
540-
}
531+
// Create single-scalar version of RepR for all lanes.
532+
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
533+
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
541534
RepR->eraseFromParent();
542535
continue;
543536
}

llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,38 @@ define void @outside_user_blocks_tail_folding(ptr nocapture readonly %ptr, i32 %
2323
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
2424
; CHECK: vector.body:
2525
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
26-
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]]
26+
; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 0
27+
; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], 1
28+
; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[INDEX]], 2
29+
; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[INDEX]], 3
30+
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 4
31+
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 5
32+
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 6
33+
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 7
34+
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 8
35+
; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 9
36+
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 10
37+
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 11
38+
; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 12
39+
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[INDEX]], 13
40+
; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 14
41+
; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[INDEX]], 15
42+
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP18]]
43+
; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP19]]
44+
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP20]]
45+
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP21]]
46+
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP6]]
47+
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP7]]
48+
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP8]]
49+
; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP9]]
50+
; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP10]]
51+
; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP11]]
52+
; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP12]]
53+
; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP13]]
54+
; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP14]]
55+
; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP15]]
56+
; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP16]]
57+
; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP17]]
2758
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
2859
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
2960
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1

llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,10 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) {
102102
; CHECK-LABEL: @uniform_store_i1(
103103
; CHECK-NEXT: entry:
104104
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
105-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
105+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64
106106
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
107107
; CHECK: vector.ph:
108-
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32
108+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 64
109109
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
110110
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], 8
111111
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[TMP1]]
@@ -116,12 +116,13 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) {
116116
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
117117
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
118118
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56, i64 64, i64 72, i64 80, i64 88, i64 96, i64 104, i64 112, i64 120, i64 128, i64 136, i64 144, i64 152, i64 160, i64 168, i64 176, i64 184, i64 192, i64 200, i64 208, i64 216, i64 224, i64 232, i64 240, i64 248>
119-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP2]], i64 1
119+
; CHECK-NEXT: [[VECTOR_GEP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> <i64 256, i64 264, i64 272, i64 280, i64 288, i64 296, i64 304, i64 312, i64 320, i64 328, i64 336, i64 344, i64 352, i64 360, i64 368, i64 376, i64 384, i64 392, i64 400, i64 408, i64 416, i64 424, i64 432, i64 440, i64 448, i64 456, i64 464, i64 472, i64 480, i64 488, i64 496, i64 504>
120+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[VECTOR_GEP1]], i64 1
120121
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <32 x ptr> [[TMP5]], [[BROADCAST_SPLAT]]
121122
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i1> [[TMP7]], i32 31
122123
; CHECK-NEXT: store i1 [[TMP8]], ptr [[DST:%.*]], align 1
123-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
124-
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 256
124+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
125+
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 512
125126
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
126127
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
127128
; CHECK: middle.block:

llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ define void @test() {
4343
; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: store float %v4, ptr %out4, align 4
4444
; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: store float %v4, ptr %out4, align 4
4545
; AVX512: LV: Found an estimated cost of 70 for VF 32 For instruction: store float %v4, ptr %out4, align 4
46-
; AVX512: LV: Found an estimated cost of 140 for VF 64 For instruction: store float %v4, ptr %out4, align 4
4746
;
4847
entry:
4948
br label %for.body

llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ define void @test() {
4343
; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction: store float %v5, ptr %out5, align 4
4444
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: store float %v5, ptr %out5, align 4
4545
; AVX512: LV: Found an estimated cost of 102 for VF 32 For instruction: store float %v5, ptr %out5, align 4
46-
; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction: store float %v5, ptr %out5, align 4
4746
;
4847
entry:
4948
br label %for.body

llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ define void @test() {
4343
; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: store double %v2, ptr %out2, align 8
4444
; AVX512: LV: Found an estimated cost of 24 for VF 16 For instruction: store double %v2, ptr %out2, align 8
4545
; AVX512: LV: Found an estimated cost of 48 for VF 32 For instruction: store double %v2, ptr %out2, align 8
46-
; AVX512: LV: Found an estimated cost of 96 for VF 64 For instruction: store double %v2, ptr %out2, align 8
4746
;
4847
entry:
4948
br label %for.body

llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ define void @test() {
3939
; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: store double %v6, ptr %out6, align 8
4040
; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: store double %v6, ptr %out6, align 8
4141
; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: store double %v6, ptr %out6, align 8
42-
; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: store double %v6, ptr %out6, align 8
4342
;
4443
entry:
4544
br label %for.body

llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ define void @test() {
4343
; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: store i32 %v4, ptr %out4, align 4
4444
; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: store i32 %v4, ptr %out4, align 4
4545
; AVX512: LV: Found an estimated cost of 70 for VF 32 For instruction: store i32 %v4, ptr %out4, align 4
46-
; AVX512: LV: Found an estimated cost of 140 for VF 64 For instruction: store i32 %v4, ptr %out4, align 4
4746
;
4847
entry:
4948
br label %for.body

llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ define void @test() {
4343
; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction: store i32 %v5, ptr %out5, align 4
4444
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: store i32 %v5, ptr %out5, align 4
4545
; AVX512: LV: Found an estimated cost of 102 for VF 32 For instruction: store i32 %v5, ptr %out5, align 4
46-
; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction: store i32 %v5, ptr %out5, align 4
4746
;
4847
entry:
4948
br label %for.body

0 commit comments

Comments
 (0)