Skip to content

Commit 806e2ae

Browse files
committed
[VPlan] Skip uses-scalars restriction if one of ops needs broadcast.
Update the logic in narrowToSingleScalarRecipes to allow narrowing even if not all users use scalars, if at least one of the operands already needs broadcasting. In that case, there won't be any additional broadcasts introduced. This should allow removing the special handling for stores, which can introduce additional broadcasts currently.
1 parent e009de2 commit 806e2ae

14 files changed

+345
-108
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,32 +1425,33 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
14251425
continue;
14261426
}
14271427

1428-
// Skip recipes that aren't single scalars or don't have only their
1429-
// scalar results used. In the latter case, we would introduce extra
1430-
// broadcasts.
1428+
// Skip recipes that aren't single scalars or whose conversion to
1429+
// single-scalar would introduce additional broadcasts. Conversion adds no
1430+
// broadcasts if either only the scalars of the recipe are used, or at
1431+
// least one of the operands would require a broadcast. In the latter
1432+
// case, the single-scalar may need to be broadcasted, but another
1433+
// broadcast is removed. Otherwise, narrowing would introduce extra
1434+
// broadcasts.
14311435
if (!vputils::isSingleScalar(RepOrWidenR) ||
1432-
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1433-
if (auto *Store = dyn_cast<VPWidenStoreRecipe>(U)) {
1434-
// VPWidenStore doesn't have users, and stores are always
1435-
// profitable to widen: hence, permitting address and mask
1436-
// operands, and single-scalar stored values is an important leaf
1437-
// condition. The assert must hold as we checked the RepOrWidenR
1438-
// operand against vputils::isSingleScalar.
1439-
assert(RepOrWidenR != Store->getStoredValue() ||
1440-
vputils::isSingleScalar(Store->getStoredValue()));
1441-
return true;
1442-
}
1443-
1444-
if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1445-
unsigned Opcode = VPI->getOpcode();
1446-
if (Opcode == VPInstruction::ExtractLastElement ||
1447-
Opcode == VPInstruction::ExtractLastLanePerPart ||
1448-
Opcode == VPInstruction::ExtractPenultimateElement)
1449-
return true;
1450-
}
1451-
1452-
return U->usesScalars(RepOrWidenR);
1453-
}))
1436+
(!all_of(RepOrWidenR->users(),
1437+
[RepOrWidenR](const VPUser *U) {
1438+
if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1439+
unsigned Opcode = VPI->getOpcode();
1440+
if (Opcode == VPInstruction::ExtractLastElement ||
1441+
Opcode == VPInstruction::ExtractLastLanePerPart ||
1442+
Opcode == VPInstruction::ExtractPenultimateElement)
1443+
return true;
1444+
}
1445+
1446+
return U->usesScalars(RepOrWidenR);
1447+
}) &&
1448+
none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
1449+
return Op->getSingleUser() == RepOrWidenR &&
1450+
((Op->isLiveIn() &&
1451+
!isa<Constant>(Op->getLiveInIRValue())) ||
1452+
(isa<VPReplicateRecipe>(Op) &&
1453+
cast<VPReplicateRecipe>(Op)->isSingleScalar()));
1454+
})))
14541455
continue;
14551456

14561457
auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -531,10 +531,10 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
531531
; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
532532
; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
533533
; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
534-
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
534+
; DEFAULT-NEXT: [[TMP2:%.*]] = or i16 [[TMP1]], 1
535+
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i64 0
535536
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
536-
; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
537-
; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
537+
; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[BROADCAST_SPLAT]] to <8 x double>
538538
; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[NEXT_GEP]], align 8
539539
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
540540
; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
@@ -563,10 +563,10 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
563563
; PRED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
564564
; PRED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
565565
; PRED-NEXT: [[TMP12:%.*]] = load i16, ptr [[SRC]], align 2
566-
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP12]], i64 0
566+
; PRED-NEXT: [[TMP11:%.*]] = or i16 [[TMP12]], 1
567+
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP11]], i64 0
567568
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
568-
; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
569-
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
569+
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[BROADCAST_SPLAT]] to <vscale x 2 x double>
570570
; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr align 8 [[NEXT_GEP]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
571571
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
572572
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
@@ -672,10 +672,10 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
672672
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP15]], i64 0
673673
; DEFAULT-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer
674674
; DEFAULT-NEXT: [[TMP16:%.*]] = load float, ptr [[SRC_2]], align 4
675-
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP16]], i64 0
675+
; DEFAULT-NEXT: [[TMP17:%.*]] = fmul float [[TMP16]], 0.000000e+00
676+
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP17]], i64 0
676677
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
677-
; DEFAULT-NEXT: [[TMP17:%.*]] = fmul <8 x float> [[BROADCAST_SPLAT]], zeroinitializer
678-
; DEFAULT-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[TMP17]])
678+
; DEFAULT-NEXT: [[TMP18:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[BROADCAST_SPLAT]])
679679
; DEFAULT-NEXT: [[TMP19:%.*]] = load float, ptr [[SRC_3]], align 4
680680
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
681681
; DEFAULT-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT10]], <8 x float> poison, <8 x i32> zeroinitializer
@@ -857,10 +857,10 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
857857
; PRED-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
858858
; PRED-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT8]], <8 x float> poison, <8 x i32> zeroinitializer
859859
; PRED-NEXT: [[TMP19:%.*]] = load float, ptr [[SRC_2]], align 4
860-
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
860+
; PRED-NEXT: [[TMP20:%.*]] = fmul float [[TMP19]], 0.000000e+00
861+
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP20]], i64 0
861862
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
862-
; PRED-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[BROADCAST_SPLAT]], zeroinitializer
863-
; PRED-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[TMP20]])
863+
; PRED-NEXT: [[TMP21:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[BROADCAST_SPLAT9]], <8 x float> zeroinitializer, <8 x float> [[BROADCAST_SPLAT]])
864864
; PRED-NEXT: [[TMP22:%.*]] = load float, ptr [[SRC_3]], align 4
865865
; PRED-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <8 x float> poison, float [[TMP22]], i64 0
866866
; PRED-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT10]], <8 x float> poison, <8 x i32> zeroinitializer

llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,16 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
5959
; VSCALEFORTUNING2-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
6060
; VSCALEFORTUNING2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
6161
; VSCALEFORTUNING2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
62-
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
63-
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
6462
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
6563
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
6664
; VSCALEFORTUNING2-NEXT: [[TMP7:%.*]] = add i64 [[Y]], 1
6765
; VSCALEFORTUNING2-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP7]]
6866
; VSCALEFORTUNING2-NEXT: [[TMP9:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
6967
; VSCALEFORTUNING2-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
7068
; VSCALEFORTUNING2-NEXT: [[TMP11:%.*]] = or <vscale x 4 x i32> [[TMP9]], [[TMP10]]
71-
; VSCALEFORTUNING2-NEXT: [[TMP12:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT]]
69+
; VSCALEFORTUNING2-NEXT: [[TMP16:%.*]] = or i32 [[Z]], [[X]]
70+
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP16]], i64 0
71+
; VSCALEFORTUNING2-NEXT: [[TMP12:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
7272
; VSCALEFORTUNING2-NEXT: [[TMP13:%.*]] = and <vscale x 4 x i32> [[TMP12]], splat (i32 1)
7373
; VSCALEFORTUNING2-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i32> [[TMP13]], splat (i32 1)
7474
; VSCALEFORTUNING2-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[TMP14]] to <vscale x 4 x i64>
@@ -180,8 +180,6 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
180180
; PRED: [[VECTOR_PH]]:
181181
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
182182
; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
183-
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
184-
; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
185183
; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
186184
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
187185
; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
@@ -195,7 +193,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
195193
; PRED-NEXT: [[TMP13:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
196194
; PRED-NEXT: [[TMP14:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
197195
; PRED-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i32> [[TMP13]], [[TMP14]]
198-
; PRED-NEXT: [[TMP16:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT]]
196+
; PRED-NEXT: [[TMP20:%.*]] = or i32 [[Z]], [[X]]
197+
; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP20]], i64 0
198+
; PRED-NEXT: [[TMP16:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
199199
; PRED-NEXT: [[TMP17:%.*]] = and <vscale x 4 x i32> [[TMP16]], splat (i32 1)
200200
; PRED-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i32> [[TMP17]], splat (i32 1)
201201
; PRED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i32> [[TMP18]] to <vscale x 4 x i64>

llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@ define void @test(ptr %p, i64 %a, i8 %b) {
99
; CHECK: vector.ph:
1010
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[B]], i64 0
1111
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
12-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[A]], i64 0
12+
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[A]], 48
13+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP0]], i64 0
1314
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
14-
; CHECK-NEXT: [[TMP5:%.*]] = shl <vscale x 2 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
15-
; CHECK-NEXT: [[TMP6:%.*]] = ashr <vscale x 2 x i64> [[TMP5]], splat (i64 52)
15+
; CHECK-NEXT: [[TMP6:%.*]] = ashr <vscale x 2 x i64> [[BROADCAST_SPLAT2]], splat (i64 52)
1616
; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 2 x i64> [[TMP6]] to <vscale x 2 x i32>
1717
; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 2 x i8> [[BROADCAST_SPLAT]] to <vscale x 2 x i32>
1818
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P]], i64 0

0 commit comments

Comments
 (0)