Skip to content

Commit 5cdeaf8

Browse files
hassnaaHamdi authored and
github-actions[bot] committed
Automerge: [LV][AArch64] Prefer epilogue with fixed-width over scalable VF. (#155546)
In case of equal costs, prefer an epilogue with a fixed-width VF over one with a scalable VF. This is helpful in cases like post-LTO vectorization, where an epilogue with a fixed-width VF can be removed once we eventually know that the trip count is less than the epilogue iterations.
2 parents 5eb3625 + 35b2276 commit 5cdeaf8

File tree

12 files changed

+492
-381
lines changed

12 files changed

+492
-381
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1812,10 +1812,11 @@ class TargetTransformInfo {
18121812
unsigned ChainSizeInBytes,
18131813
VectorType *VecTy) const;
18141814

1815-
/// \returns True if the targets prefers fixed width vectorization if the
1815+
/// \returns True if the target prefers fixed width vectorization if the
18161816
/// loop vectorizer's cost-model assigns an equal cost to the fixed and
18171817
/// scalable version of the vectorized loop.
1818-
LLVM_ABI bool preferFixedOverScalableIfEqualCost() const;
1818+
/// \p IsEpilogue is true if the decision is for the epilogue loop.
1819+
LLVM_ABI bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const;
18191820

18201821
/// \returns True if target prefers SLP vectorizer with alternate opcode
18211822
/// vectorization, false - otherwise.

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1092,7 +1092,9 @@ class TargetTransformInfoImplBase {
10921092
return VF;
10931093
}
10941094

1095-
virtual bool preferFixedOverScalableIfEqualCost() const { return false; }
1095+
virtual bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
1096+
return false;
1097+
}
10961098

10971099
virtual bool preferInLoopReduction(RecurKind Kind, Type *Ty) const {
10981100
return false;

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1403,8 +1403,9 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
14031403
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
14041404
}
14051405

1406-
bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const {
1407-
return TTIImpl->preferFixedOverScalableIfEqualCost();
1406+
bool TargetTransformInfo::preferFixedOverScalableIfEqualCost(
1407+
bool IsEpilogue) const {
1408+
return TTIImpl->preferFixedOverScalableIfEqualCost(IsEpilogue);
14081409
}
14091410

14101411
bool TargetTransformInfo::preferInLoopReduction(RecurKind Kind,

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6022,9 +6022,15 @@ static bool containsDecreasingPointers(Loop *TheLoop,
60226022
return false;
60236023
}
60246024

6025-
bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
6025+
bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
60266026
if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
60276027
return SVEPreferFixedOverScalableIfEqualCost;
6028+
// For cases like post-LTO vectorization, when we eventually know the trip
6029+
// count, epilogue with fixed-width vectorization can be deleted if the trip
6030+
// count is less than the epilogue iterations. That's why we prefer
6031+
// fixed-width vectorization in epilogue in case of equal costs.
6032+
if (IsEpilogue)
6033+
return true;
60286034
return ST->useFixedOverScalableIfEqualCost();
60296035
}
60306036

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
424424
return TailFoldingStyle::DataWithoutLaneMask;
425425
}
426426

427-
bool preferFixedOverScalableIfEqualCost() const override;
427+
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override;
428428

429429
unsigned getEpilogueVectorizationMinVF() const override;
430430

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -622,13 +622,15 @@ class LoopVectorizationPlanner {
622622
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
623623
/// that of B.
624624
bool isMoreProfitable(const VectorizationFactor &A,
625-
const VectorizationFactor &B, bool HasTail) const;
625+
const VectorizationFactor &B, bool HasTail,
626+
bool IsEpilogue = false) const;
626627

627628
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
628629
/// that of B in the context of vectorizing a loop with known \p MaxTripCount.
629630
bool isMoreProfitable(const VectorizationFactor &A,
630631
const VectorizationFactor &B,
631-
const unsigned MaxTripCount, bool HasTail) const;
632+
const unsigned MaxTripCount, bool HasTail,
633+
bool IsEpilogue = false) const;
632634

633635
/// Determines if we have the infrastructure to vectorize the loop and its
634636
/// epilogue, assuming the main loop is vectorized by \p VF.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3856,7 +3856,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
38563856
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
38573857
const VectorizationFactor &B,
38583858
const unsigned MaxTripCount,
3859-
bool HasTail) const {
3859+
bool HasTail,
3860+
bool IsEpilogue) const {
38603861
InstructionCost CostA = A.Cost;
38613862
InstructionCost CostB = B.Cost;
38623863

@@ -3880,7 +3881,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
38803881
// Assume vscale may be larger than 1 (or the value being tuned for),
38813882
// so that scalable vectorization is slightly favorable over fixed-width
38823883
// vectorization.
3883-
bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
3884+
bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
38843885
A.Width.isScalable() && !B.Width.isScalable();
38853886

38863887
auto CmpFn = [PreferScalable](const InstructionCost &LHS,
@@ -3918,10 +3919,11 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
39183919

39193920
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
39203921
const VectorizationFactor &B,
3921-
bool HasTail) const {
3922+
bool HasTail,
3923+
bool IsEpilogue) const {
39223924
const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3923-
return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
3924-
HasTail);
3925+
return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
3926+
IsEpilogue);
39253927
}
39263928

39273929
void LoopVectorizationPlanner::emitInvalidCostRemarks(
@@ -4454,7 +4456,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
44544456
}
44554457

44564458
if (Result.Width.isScalar() ||
4457-
isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking()))
4459+
isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
4460+
/*IsEpilogue*/ true))
44584461
Result = NextVF;
44594462
}
44604463

llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll

Lines changed: 20 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ target triple = "aarch64-unknown-linux-gnu"
1212
; DEBUG: LV: Found maximum trip count: 19
1313
; DEBUG: LV: IC is 1
1414
; DEBUG-VS1: LV: VF is vscale x 16
15-
; DEBUG-VS1: Main Loop VF:vscale x 16, Main Loop UF:1, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1
15+
; DEBUG-VS1: Main Loop VF:vscale x 16, Main Loop UF:1, Epilogue Loop VF:8, Epilogue Loop UF:1
1616
; DEBUG-VS2: LV: VF is vscale x 8
17-
; DEBUG-VS2: Main Loop VF:vscale x 8, Main Loop UF:1, Epilogue Loop VF:vscale x 4, Epilogue Loop UF:1
17+
; DEBUG-VS2: Main Loop VF:vscale x 8, Main Loop UF:1, Epilogue Loop VF:8, Epilogue Loop UF:1
1818

1919
; DEBUG-LABEL: LV: Checking a loop in 'trip_count_too_small'
2020
; DEBUG: LV: Found a loop with a very small trip count. This loop is worth vectorizing only if no scalar iteration overheads are incurred.
@@ -48,9 +48,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
4848
; CHECK-VS1-NEXT: [[TMP1:%.*]] = add i32 [[TC]], 1
4949
; CHECK-VS1-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
5050
; CHECK-VS1-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]]
51-
; CHECK-VS1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
52-
; CHECK-VS1-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3
53-
; CHECK-VS1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
51+
; CHECK-VS1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
5452
; CHECK-VS1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
5553
; CHECK-VS1: [[VECTOR_SCEVCHECK]]:
5654
; CHECK-VS1-NEXT: [[TMP6:%.*]] = add i32 [[TC]], 1
@@ -91,28 +89,24 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
9189
; CHECK-VS1: [[VEC_EPILOG_ITER_CHECK]]:
9290
; CHECK-VS1-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
9391
; CHECK-VS1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
94-
; CHECK-VS1-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
95-
; CHECK-VS1-NEXT: [[TMP27:%.*]] = shl nuw i64 [[TMP26]], 3
96-
; CHECK-VS1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP27]]
92+
; CHECK-VS1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
9793
; CHECK-VS1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
9894
; CHECK-VS1: [[VEC_EPILOG_PH]]:
9995
; CHECK-VS1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
100-
; CHECK-VS1-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
101-
; CHECK-VS1-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 8
102-
; CHECK-VS1-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
96+
; CHECK-VS1-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 8
10397
; CHECK-VS1-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
10498
; CHECK-VS1-NEXT: [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
105-
; CHECK-VS1-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[CONV]], i64 0
106-
; CHECK-VS1-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
99+
; CHECK-VS1-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i8> poison, i8 [[CONV]], i64 0
100+
; CHECK-VS1-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT4]], <8 x i8> poison, <8 x i32> zeroinitializer
107101
; CHECK-VS1-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
108102
; CHECK-VS1: [[VEC_EPILOG_VECTOR_BODY]]:
109103
; CHECK-VS1-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
110104
; CHECK-VS1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX5]]
111105
; CHECK-VS1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[OFFSET_IDX]]
112-
; CHECK-VS1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x i8>, ptr [[TMP33]], align 1
113-
; CHECK-VS1-NEXT: [[TMP35:%.*]] = add <vscale x 8 x i8> [[WIDE_LOAD6]], [[BROADCAST_SPLAT8]]
114-
; CHECK-VS1-NEXT: store <vscale x 8 x i8> [[TMP35]], ptr [[TMP33]], align 1
115-
; CHECK-VS1-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], [[TMP29]]
106+
; CHECK-VS1-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i8>, ptr [[TMP33]], align 1
107+
; CHECK-VS1-NEXT: [[TMP23:%.*]] = add <8 x i8> [[WIDE_LOAD7]], [[BROADCAST_SPLAT5]]
108+
; CHECK-VS1-NEXT: store <8 x i8> [[TMP23]], ptr [[TMP33]], align 1
109+
; CHECK-VS1-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 8
116110
; CHECK-VS1-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
117111
; CHECK-VS1-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
118112
; CHECK-VS1: [[VEC_EPILOG_MIDDLE_BLOCK]]:
@@ -148,9 +142,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
148142
; CHECK-VS2-NEXT: [[TMP1:%.*]] = add i32 [[TC]], 1
149143
; CHECK-VS2-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
150144
; CHECK-VS2-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]]
151-
; CHECK-VS2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
152-
; CHECK-VS2-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
153-
; CHECK-VS2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
145+
; CHECK-VS2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
154146
; CHECK-VS2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
155147
; CHECK-VS2: [[VECTOR_SCEVCHECK]]:
156148
; CHECK-VS2-NEXT: [[TMP6:%.*]] = add i32 [[TC]], 1
@@ -191,28 +183,24 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
191183
; CHECK-VS2: [[VEC_EPILOG_ITER_CHECK]]:
192184
; CHECK-VS2-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
193185
; CHECK-VS2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
194-
; CHECK-VS2-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
195-
; CHECK-VS2-NEXT: [[TMP27:%.*]] = shl nuw i64 [[TMP26]], 2
196-
; CHECK-VS2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP27]]
186+
; CHECK-VS2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
197187
; CHECK-VS2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
198188
; CHECK-VS2: [[VEC_EPILOG_PH]]:
199189
; CHECK-VS2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
200-
; CHECK-VS2-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
201-
; CHECK-VS2-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 4
202-
; CHECK-VS2-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
190+
; CHECK-VS2-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 8
203191
; CHECK-VS2-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
204192
; CHECK-VS2-NEXT: [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
205-
; CHECK-VS2-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x i8> poison, i8 [[CONV]], i64 0
206-
; CHECK-VS2-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
193+
; CHECK-VS2-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i8> poison, i8 [[CONV]], i64 0
194+
; CHECK-VS2-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT4]], <8 x i8> poison, <8 x i32> zeroinitializer
207195
; CHECK-VS2-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
208196
; CHECK-VS2: [[VEC_EPILOG_VECTOR_BODY]]:
209197
; CHECK-VS2-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
210198
; CHECK-VS2-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX5]]
211199
; CHECK-VS2-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[OFFSET_IDX]]
212-
; CHECK-VS2-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP33]], align 1
213-
; CHECK-VS2-NEXT: [[TMP35:%.*]] = add <vscale x 4 x i8> [[WIDE_LOAD6]], [[BROADCAST_SPLAT8]]
214-
; CHECK-VS2-NEXT: store <vscale x 4 x i8> [[TMP35]], ptr [[TMP33]], align 1
215-
; CHECK-VS2-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], [[TMP29]]
200+
; CHECK-VS2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i8>, ptr [[TMP33]], align 1
201+
; CHECK-VS2-NEXT: [[TMP23:%.*]] = add <8 x i8> [[WIDE_LOAD7]], [[BROADCAST_SPLAT5]]
202+
; CHECK-VS2-NEXT: store <8 x i8> [[TMP23]], ptr [[TMP33]], align 1
203+
; CHECK-VS2-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 8
216204
; CHECK-VS2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
217205
; CHECK-VS2-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
218206
; CHECK-VS2: [[VEC_EPILOG_MIDDLE_BLOCK]]:

llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,7 @@ define void @cost_store_i8(ptr %dst) #0 {
99
; DEFAULT-LABEL: define void @cost_store_i8(
1010
; DEFAULT-SAME: ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
1111
; DEFAULT-NEXT: iter.check:
12-
; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
13-
; DEFAULT-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
14-
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 101, [[TMP1]]
15-
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
12+
; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
1613
; DEFAULT: vector.main.loop.iter.check:
1714
; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
1815
; DEFAULT-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5
@@ -40,29 +37,22 @@ define void @cost_store_i8(ptr %dst) #0 {
4037
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
4138
; DEFAULT: vec.epilog.iter.check:
4239
; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 101, [[N_VEC]]
43-
; DEFAULT-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
44-
; DEFAULT-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 3
45-
; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP13]]
40+
; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
4641
; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
4742
; DEFAULT: vec.epilog.ph:
4843
; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
49-
; DEFAULT-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
50-
; DEFAULT-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 8
51-
; DEFAULT-NEXT: [[N_MOD_VF2:%.*]] = urem i64 101, [[TMP15]]
52-
; DEFAULT-NEXT: [[N_VEC3:%.*]] = sub i64 101, [[N_MOD_VF2]]
5344
; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
5445
; DEFAULT: vec.epilog.vector.body:
5546
; DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
5647
; DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX5]]
57-
; DEFAULT-NEXT: store <vscale x 8 x i8> zeroinitializer, ptr [[TMP19]], align 1
58-
; DEFAULT-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP15]]
59-
; DEFAULT-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
60-
; DEFAULT-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
48+
; DEFAULT-NEXT: store <8 x i8> zeroinitializer, ptr [[TMP19]], align 1
49+
; DEFAULT-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], 8
50+
; DEFAULT-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 96
51+
; DEFAULT-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
6152
; DEFAULT: vec.epilog.middle.block:
62-
; DEFAULT-NEXT: [[CMP_N4:%.*]] = icmp eq i64 101, [[N_VEC3]]
63-
; DEFAULT-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
53+
; DEFAULT-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
6454
; DEFAULT: vec.epilog.scalar.ph:
65-
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
55+
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
6656
; DEFAULT-NEXT: br label [[LOOP:%.*]]
6757
; DEFAULT: loop:
6858
; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
1010
define void @foo(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i64 %len) #0 {
1111
; CHECK-EPILOG: vec.epilog.ph:
1212
; CHECK-EPILOG: vec.epilog.vector.body:
13-
; CHECK-EPILOG: load <vscale x 4 x i16>
13+
; CHECK-EPILOG: load <8 x i16>
1414

1515
; The epilogue loop gets vectorised vscale x 2 x i16 wide.
1616
; CHECK-EPILOG-V2: vec.epilog.ph:

0 commit comments

Comments
 (0)