Skip to content

Commit 1efa997

Browse files
committed
[VPlan] Handle stores to single-scalar addr in narrowToSingleScalarRecipes.
Move the handling of stores to a single-scalar/uniform address from replicateByVF to narrowToSingleScalarRecipes.
1 parent 701b839 commit 1efa997

File tree

5 files changed

+41
-34
lines changed

5 files changed

+41
-34
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1308,6 +1308,20 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
13081308
continue;
13091309

13101310
auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1311+
if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1312+
vputils::isSingleScalar(RepR->getOperand(1))) {
1313+
auto *Clone = new VPReplicateRecipe(
1314+
RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1315+
true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
1316+
Clone->insertBefore(RepOrWidenR);
1317+
auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
1318+
{Clone->getOperand(0)});
1319+
Ext->insertBefore(Clone);
1320+
Clone->setOperand(0, Ext);
1321+
RepR->eraseFromParent();
1322+
continue;
1323+
}
1324+
13111325
// Skip recipes that aren't single scalars or don't have only their
13121326
// scalar results used. In the latter case, we would introduce extra
13131327
// broadcasts.
@@ -2217,8 +2231,8 @@ void VPlanTransforms::optimize(VPlan &Plan) {
22172231
runPass(simplifyRecipes, Plan);
22182232
runPass(removeDeadRecipes, Plan);
22192233
runPass(simplifyBlends, Plan);
2220-
runPass(narrowToSingleScalarRecipes, Plan);
22212234
runPass(legalizeAndOptimizeInductions, Plan);
2235+
runPass(narrowToSingleScalarRecipes, Plan);
22222236
runPass(removeRedundantExpandSCEVRecipes, Plan);
22232237
runPass(simplifyRecipes, Plan);
22242238
runPass(removeBranchOnConst, Plan);

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -536,16 +536,9 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
536536

537537
VPBuilder Builder(RepR);
538538
if (RepR->getNumUsers() == 0) {
539-
if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
540-
vputils::isSingleScalar(RepR->getOperand(1))) {
541-
// Stores to invariant addresses need to store the last lane only.
542-
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
543-
Def2LaneDefs);
544-
} else {
545-
// Create single-scalar version of RepR for all lanes.
546-
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
547-
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
548-
}
539+
// Create single-scalar version of RepR for all lanes.
540+
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
541+
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
549542
RepR->eraseFromParent();
550543
continue;
551544
}

llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -226,11 +226,10 @@ define i64 @test_pr62954_scalar_epilogue_required(ptr %A, ptr noalias %B, ptr %C
226226
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 36
227227
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
228228
; CHECK: middle.block:
229-
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
230229
; CHECK-NEXT: br label [[SCALAR_PH]]
231230
; CHECK: scalar.ph:
232231
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 73, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
233-
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[REC_START]], [[ENTRY]] ]
232+
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ [[REC_START]], [[ENTRY]] ]
234233
; CHECK-NEXT: br label [[LOOP:%.*]]
235234
; CHECK: loop:
236235
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -364,28 +363,28 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
364363
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1
365364
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 2
366365
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 3
367-
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 4
368-
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 5
369-
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 6
370-
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 7
371-
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw [3 x float], ptr [[A:%.*]], i64 [[TMP5]]
372-
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]]
373-
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP7]]
374-
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]]
375-
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP13]], i32 0
376-
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x ptr> [[TMP17]], ptr [[TMP14]], i32 1
377-
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x ptr> [[TMP18]], ptr [[TMP15]], i32 2
378-
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x ptr> [[TMP19]], ptr [[TMP16]], i32 3
379-
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP9]]
380-
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP10]]
381-
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP11]]
382-
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP12]]
366+
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 4
367+
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 5
368+
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 6
369+
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 7
370+
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr nusw [3 x float], ptr [[A:%.*]], i64 [[TMP5]]
371+
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]]
372+
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP7]]
373+
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]]
383374
; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP21]], i32 0
384375
; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x ptr> [[TMP25]], ptr [[TMP22]], i32 1
385376
; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x ptr> [[TMP26]], ptr [[TMP23]], i32 2
386377
; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x ptr> [[TMP27]], ptr [[TMP24]], i32 3
387-
; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP20]], <4 x ptr> [[TMP28]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
388-
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP21]], align 4
378+
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP11]]
379+
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP12]]
380+
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP9]]
381+
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP10]]
382+
; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP19]], i32 0
383+
; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x ptr> [[TMP33]], ptr [[TMP20]], i32 1
384+
; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x ptr> [[TMP34]], ptr [[TMP31]], i32 2
385+
; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x ptr> [[TMP35]], ptr [[TMP32]], i32 3
386+
; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP28]], <4 x ptr> [[TMP38]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
387+
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP19]], align 4
389388
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
390389
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3
391390
; CHECK-NEXT: store float [[TMP30]], ptr [[C:%.*]], align 4
@@ -399,7 +398,7 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
399398
; CHECK-NEXT: br label [[SCALAR_PH]]
400399
; CHECK: scalar.ph:
401400
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
402-
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ]
401+
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[TMP32]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ]
403402
; CHECK-NEXT: br label [[LOOP:%.*]]
404403
; CHECK: loop:
405404
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]

llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@ define void @minbw_cast(ptr %dst, i64 %n, i1 %bool1, i1 %bool2) {
1919
; CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT2]] to <4 x i8>
2020
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i8>
2121
; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i8> [[TMP0]], [[TMP1]]
22+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
2223
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
2324
; CHECK: [[VECTOR_BODY]]:
2425
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
25-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
2626
; CHECK-NEXT: store i8 [[TMP3]], ptr [[DST]], align 1
2727
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
2828
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]

llvm/test/Transforms/LoopVectorize/vplan-printing.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -903,7 +903,8 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) {
903903
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx>
904904
; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
905905
; CHECK-NEXT: WIDEN-CAST ir<%zext> = zext nneg ir<%l>
906-
; CHECK-NEXT: REPLICATE store ir<%zext>, ir<%p1>
906+
; CHECK-NEXT: EMIT vp<[[EXT:%.+]]> = extract-last-element ir<%zext>
907+
; CHECK-NEXT: CLONE store vp<[[EXT]]>, ir<%p1>
907908
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>
908909
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
909910
; CHECK-NEXT: No successors

0 commit comments

Comments
 (0)