Skip to content

Commit a9c9a52

Browse files
committed
Ignore in-loop reductions
1 parent 170f6f2 commit a9c9a52

File tree

3 files changed

+54
-51
lines changed

3 files changed

+54
-51
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,6 +1022,12 @@ class LoopVectorizationCostModel {
10221022
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
10231023
};
10241024

1025+
/// \return Information about the register usage of the loop for the
1026+
/// given plan and vectorization factors.
1027+
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
1028+
calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
1029+
const TargetTransformInfo &TTI);
1030+
10251031
/// Collect values we want to ignore in the cost model.
10261032
void collectValuesToIgnore();
10271033

@@ -4861,9 +4867,9 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
48614867

48624868
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs.
48634869
/// Returns the register usage for each VF in \p VFs.
4864-
static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
4865-
calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
4866-
const TargetTransformInfo &TTI) {
4870+
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
4871+
LoopVectorizationCostModel::calculateRegisterUsage(
4872+
VPlan &Plan, ArrayRef<ElementCount> VFs, const TargetTransformInfo &TTI) {
48674873
// This function calculates the register usage by measuring the highest number
48684874
// of values that are alive at a single location. Obviously, this is a very
48694875
// rough estimation. We scan the loop in topological order and
@@ -5003,6 +5009,12 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
50035009
for (auto *R : OpenIntervals) {
50045010
if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))
50055011
continue;
5012+
if (auto *Phi = dyn_cast<VPReductionPHIRecipe>(R);
5013+
Phi && Phi->getUnderlyingInstr()) {
5014+
if (auto *PhiNode = dyn_cast<PHINode>(Phi->getUnderlyingInstr());
5015+
PhiNode && isInLoopReduction(PhiNode))
5016+
continue;
5017+
}
50065018
if (isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
50075019
VPScalarIVStepsRecipe>(R) ||
50085020
(isa<VPInstruction>(R) &&
@@ -5147,7 +5159,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
51475159
return 1;
51485160
}
51495161

5150-
RegisterUsage R = ::calculateRegisterUsage(Plan, {VF}, TTI)[0];
5162+
RegisterUsage R = calculateRegisterUsage(Plan, {VF}, TTI)[0];
51515163
// We divide by these constants so assume that we have at least one
51525164
// instruction that uses at least one register.
51535165
for (auto &Pair : R.MaxLocalUsers) {
@@ -7546,7 +7558,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
75467558

75477559
for (auto &P : VPlans) {
75487560
SmallVector<ElementCount, 1> VFs(P->vectorFactors());
7549-
auto RUs = ::calculateRegisterUsage(*P, VFs, TTI);
7561+
auto RUs = CM.calculateRegisterUsage(*P, VFs, TTI);
75507562
for (unsigned I = 0; I < VFs.size(); I++) {
75517563
auto VF = VFs[I];
75527564
if (VF.isScalar())
@@ -7597,8 +7609,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
75977609
SmallVector<ElementCount, 1> VFs = {BestFactor.Width};
75987610

75997611
auto LegacyRUs =
7600-
calculateRegisterUsage(getPlanFor(LegacyVF.Width), LegacyVFs, TTI);
7601-
auto RUs = calculateRegisterUsage(BestPlan, VFs, TTI);
7612+
CM.calculateRegisterUsage(getPlanFor(LegacyVF.Width), LegacyVFs, TTI);
7613+
auto RUs = CM.calculateRegisterUsage(BestPlan, VFs, TTI);
76027614

76037615
auto GetMaxUsage = [](SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers) {
76047616
unsigned Max = 0;

llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -335,20 +335,20 @@ define i32 @add_i8_i32(ptr nocapture readonly %x, i32 %n) #0 {
335335
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
336336
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
337337
; CHECK: vector.ph:
338-
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 7
339-
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
338+
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 15
339+
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
340340
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
341341
; CHECK: vector.body:
342342
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
343343
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
344-
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
344+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
345345
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
346-
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP0]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
347-
; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
348-
; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP1]], <8 x i32> zeroinitializer
349-
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
346+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
347+
; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
348+
; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer
349+
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
350350
; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
351-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
351+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
352352
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
353353
; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
354354
; CHECK: for.cond.cleanup:
@@ -1403,21 +1403,21 @@ define i32 @mla_i8_i32_multiuse(ptr nocapture readonly %x, ptr nocapture readonl
14031403
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
14041404
; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
14051405
; CHECK: vector.ph:
1406-
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 7
1407-
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
1406+
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 15
1407+
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
14081408
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
14091409
; CHECK: vector.body:
14101410
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
14111411
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
1412-
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
1412+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
14131413
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
1414-
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP0]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
1415-
; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
1416-
; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i32> [[TMP1]], [[TMP1]]
1417-
; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer
1418-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
1414+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
1415+
; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
1416+
; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]]
1417+
; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer
1418+
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
14191419
; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
1420-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
1420+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
14211421
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
14221422
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
14231423
; CHECK: for.cond.cleanup:
@@ -1519,25 +1519,25 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no
15191519
; CHECK-NEXT: entry:
15201520
; CHECK-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[N:%.*]], 0
15211521
; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP16]])
1522-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[N]], 4
1522+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[N]], 8
15231523
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
15241524
; CHECK: vector.ph:
1525-
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
1525+
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483640
15261526
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
15271527
; CHECK: vector.body:
15281528
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
15291529
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
15301530
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
15311531
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
1532-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
1533-
; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
1534-
; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[TMP1]], [[TMP1]]
1535-
; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <4 x i32> [[TMP2]] to <4 x i64>
1536-
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP3]])
1532+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
1533+
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
1534+
; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]]
1535+
; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64>
1536+
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]])
15371537
; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]]
1538-
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
1538+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
15391539
; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI1]]
1540-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
1540+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
15411541
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
15421542
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
15431543
; CHECK: middle.block:

llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,25 +35,16 @@ define void @arm_mean_q7(ptr noundef %pSrc, i32 noundef %blockSize, ptr noundef
3535
; CHECK-NEXT: [[AND:%.*]] = and i32 [[BLOCKSIZE]], 15
3636
; CHECK-NEXT: [[CMP2_NOT15:%.*]] = icmp eq i32 [[AND]], 0
3737
; CHECK-NEXT: br i1 [[CMP2_NOT15]], label [[WHILE_END5:%.*]], label [[MIDDLE_BLOCK:%.*]]
38-
; CHECK: vector.ph:
39-
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i32 [[AND]], 7
40-
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], 24
41-
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
42-
; CHECK: vector.body:
43-
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
44-
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[SUM_0_LCSSA]], [[MIDDLE_BLOCK]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
45-
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC_ADDR_0_LCSSA]], i32 [[INDEX]]
46-
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = tail call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[AND]])
47-
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[NEXT_GEP]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
48-
; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
49-
; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer
50-
; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
51-
; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI]]
52-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
53-
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
54-
; CHECK-NEXT: br i1 [[TMP8]], label [[WHILE_END5]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
38+
; CHECK: middle.block:
39+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = tail call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 0, i32 [[AND]])
40+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[PSRC_ADDR_0_LCSSA]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
41+
; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
42+
; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
43+
; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]])
44+
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM_0_LCSSA]]
45+
; CHECK-NEXT: br label [[WHILE_END5]]
5546
; CHECK: while.end5:
56-
; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_0_LCSSA]], [[WHILE_END]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
47+
; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_0_LCSSA]], [[WHILE_END]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
5748
; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUM_1_LCSSA]], [[BLOCKSIZE]]
5849
; CHECK-NEXT: [[CONV6:%.*]] = trunc i32 [[DIV]] to i8
5950
; CHECK-NEXT: store i8 [[CONV6]], ptr [[PRESULT:%.*]], align 1

0 commit comments

Comments
 (0)