Commit e6b508f

sdesmalen-arm authored and llvmbot committed
[LV] Use VScaleForTuning to allow wider epilogue VFs.
When the main loop is e.g. VF=vscale x 1 and the epilogue VF cannot be any smaller, the vectorizer should try to estimate how many lanes are executed at runtime and allow a suitable fixed-width VF to be chosen. It can use VScaleForTuning to figure out what a suitable fixed-width VF could be. For the case where the main loop VF is VF=vscale x 1 and VScaleForTuning=8, it could still choose an epilogue VF of up to VF=4.

This was a bit tricky to test, so this patch also introduces a wrapper function to get the value of 'VScaleForTuning' by also considering vscale_range: if min and max are equal, then that is the vscale we compile for. It makes little sense to tune for a different width if the code will not be portable to other widths.

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D118709

(cherry picked from commit eaee477)
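To make the wrapper's precedence concrete, here is a minimal standalone sketch of the same decision. VScaleRange and getVScaleForTuningSketch are hypothetical stand-ins, not LLVM's API; the patch itself queries Attribute::VScaleRange and TTI.getVScaleForTuning().

#include <cstdio>
#include <optional>

// Hypothetical stand-in for the parsed vscale_range attribute.
struct VScaleRange {
  unsigned Min = 0;
  unsigned Max = 0; // 0 models an absent/unbounded max.
};

// A pinned vscale_range (min == max) is a guarantee, so it takes
// precedence over the target's tuning hint, which is only a heuristic.
std::optional<unsigned>
getVScaleForTuningSketch(std::optional<VScaleRange> Range,
                         std::optional<unsigned> TTIVScale) {
  if (Range && Range->Max && Range->Min == Range->Max)
    return Range->Max;
  // Otherwise fall back to the tuning value (e.g. implied by -mcpu/-mtune).
  return TTIVScale;
}

int main() {
  // vscale_range(8, 8) pins vscale to 8 regardless of the TTI hint.
  std::printf("%u\n", *getVScaleForTuningSketch(VScaleRange{8, 8}, 2));  // 8
  // A loose range defers to the TTI tuning value.
  std::printf("%u\n", *getVScaleForTuningSketch(VScaleRange{1, 16}, 2)); // 2
}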
1 parent 319f4b2 commit e6b508f

2 files changed (+218, -10 lines)

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 32 additions & 3 deletions
@@ -1701,6 +1701,11 @@ class LoopVectorizationCostModel {
 private:
   unsigned NumPredStores = 0;
 
+  /// Convenience function that returns the value of vscale_range iff
+  /// vscale_range.min == vscale_range.max or otherwise returns the value
+  /// returned by the corresponding TLI method.
+  Optional<unsigned> getVScaleForTuning() const;
+
   /// \return An upper bound for the vectorization factors for both
   /// fixed and scalable vectorization, where the minimum-known number of
   /// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -5600,6 +5605,18 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
   return MaxVF;
 }
 
+Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
+  if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+    auto Min = Attr.getVScaleRangeMin();
+    auto Max = Attr.getVScaleRangeMax();
+    if (Max && Min == Max)
+      return Max;
+  }
+
+  return TTI.getVScaleForTuning();
+}
+
 bool LoopVectorizationCostModel::isMoreProfitable(
     const VectorizationFactor &A, const VectorizationFactor &B) const {
   InstructionCost CostA = A.Cost;
@@ -5624,7 +5641,7 @@ bool LoopVectorizationCostModel::isMoreProfitable(
   // Improve estimate for the vector width if it is scalable.
   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
-  if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
+  if (Optional<unsigned> VScale = getVScaleForTuning()) {
     if (A.Width.isScalable())
       EstimatedWidthA *= VScale.getValue();
     if (B.Width.isScalable())
@@ -5673,7 +5690,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
 
 #ifndef NDEBUG
   unsigned AssumedMinimumVscale = 1;
-  if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
+  if (Optional<unsigned> VScale = getVScaleForTuning())
     AssumedMinimumVscale = VScale.getValue();
   unsigned Width =
       Candidate.Width.isScalable()
@@ -5885,8 +5902,20 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
     return Result;
   }
 
+  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
+  // the main loop handles 8 lanes per iteration. We could still benefit from
+  // vectorizing the epilogue loop with VF=4.
+  ElementCount EstimatedRuntimeVF = MainLoopVF;
+  if (MainLoopVF.isScalable()) {
+    EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
+    if (Optional<unsigned> VScale = getVScaleForTuning())
+      EstimatedRuntimeVF *= VScale.getValue();
+  }
+
   for (auto &NextVF : ProfitableVFs)
-    if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+    if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
+          ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
+         ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
        (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
        LVP.hasPlanWithVF(NextVF.Width))
      Result = NextVF;
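The selection change above can be read as a worked example in the sketch below; it is a toy model, with EC as a hypothetical stand-in for LLVM's ElementCount and the final comparison a crude stand-in for ElementCount::isKnownLT (which is always false when comparing a fixed width against a scalable one, which is why the old code rejected fixed epilogue VFs outright).

#include <cstdio>

// Toy model of a vectorization factor: a known-minimum lane count,
// optionally scaled by the runtime vscale. Hypothetical, for illustration.
struct EC {
  unsigned MinLanes;
  bool Scalable;
};

// Sketch of the new admissibility check: a fixed-width epilogue candidate is
// compared against the *estimated* runtime width of a scalable main loop VF,
// not just its known-minimum width. VScaleForTuning = 0 means "unknown".
bool epilogueVFIsNarrowEnough(EC NextVF, EC MainLoopVF,
                              unsigned VScaleForTuning) {
  unsigned EstimatedRuntimeLanes = MainLoopVF.MinLanes;
  if (MainLoopVF.Scalable && VScaleForTuning)
    EstimatedRuntimeLanes *= VScaleForTuning;

  // Fixed candidate vs. scalable main loop: use the runtime estimate.
  if (!NextVF.Scalable && MainLoopVF.Scalable)
    return NextVF.MinLanes < EstimatedRuntimeLanes;
  // Otherwise compare known-minimum widths of like kinds, as before the patch.
  return NextVF.Scalable == MainLoopVF.Scalable &&
         NextVF.MinLanes < MainLoopVF.MinLanes;
}

int main() {
  EC Main{2, /*Scalable=*/true};   // main loop VF = vscale x 2
  EC Fixed8{8, /*Scalable=*/false};
  // With vscale expected to be 8, the main loop handles ~16 lanes, so a
  // fixed VF=8 epilogue is still narrower and may be selected.
  std::printf("%d\n", epilogueVFIsNarrowEnough(Fixed8, Main, 8)); // 1
  // Without a vscale estimate, VF=8 is not known to be narrower than
  // vscale x 2, so it is rejected.
  std::printf("%d\n", epilogueVFIsNarrowEnough(Fixed8, Main, 0)); // 0
}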

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll

Lines changed: 186 additions & 7 deletions
@@ -2,24 +2,22 @@
 ; REQUIRES: asserts
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=0 --debug-only=loop-vectorize -force-target-instruction-cost=1 -S 2>%t | FileCheck %s --check-prefix=CHECK
 ; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=8 --debug-only=loop-vectorize -S 2>%t | FileCheck %s --check-prefix=CHECK
-; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-force-VF=8 --debug-only=loop-vectorize -S 2>%t | FileCheck %s --check-prefix=CHECK-VF8
 ; RUN: cat %t | FileCheck %s --check-prefix=DEBUG-FORCED
 
 target triple = "aarch64-linux-gnu"
 
-; DEBUG: LV: Checking a loop in "f1"
+; DEBUG: LV: Checking a loop in "main_vf_vscale_x_16"
 ; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
 ; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1
 
-; DEBUG-FORCED: LV: Checking a loop in "f1"
+; DEBUG-FORCED: LV: Checking a loop in "main_vf_vscale_x_16"
 ; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
 ; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
 ; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
 
-define void @f1(i8* %A) #0 {
-; CHECK-LABEL: @f1(
+define void @main_vf_vscale_x_16(i8* %A) #0 {
+; CHECK-LABEL: @main_vf_vscale_x_16(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
@@ -105,7 +103,7 @@ define void @f1(i8* %A) #0 {
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
-; CHECK-VF8-LABEL: @f1(
+; CHECK-VF8-LABEL: @main_vf_vscale_x_16(
 ; CHECK-VF8-NEXT:  iter.check:
 ; CHECK-VF8-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK-VF8:       vector.main.loop.iter.check:
@@ -195,4 +193,185 @@ exit:
   ret void
 }
 
+
+; DEBUG: LV: Checking a loop in "main_vf_vscale_x_2"
+; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
+; DEBUG: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
+
+; DEBUG-FORCED: LV: Checking a loop in "main_vf_vscale_x_2"
+; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
+; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
+; DEBUG-FORCED: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
+
+; When the vector.body uses VF=vscale x 1 (or VF=vscale x 2 because
+; that's the minimum supported VF by SVE), we could still use a wide
+; fixed-width VF=8 for the epilogue if the vectors are known to be
+; sufficiently wide. This information can be deduced from vscale_range or
+; VScaleForTuning (set by mcpu/mtune).
+define void @main_vf_vscale_x_2(i64* %A) #0 vscale_range(8, 8) {
+; CHECK-LABEL: @main_vf_vscale_x_2(
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
+; CHECK-NEXT:    store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP13]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
+; CHECK-NEXT:    store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
+; CHECK-NEXT:    store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
+; CHECK-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT:    br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
+; CHECK-NEXT:    store i64 1, i64* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+; CHECK-VF8-LABEL: @main_vf_vscale_x_2(
+; CHECK-VF8-NEXT:  iter.check:
+; CHECK-VF8-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8:       vector.main.loop.iter.check:
+; CHECK-VF8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-VF8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-VF8-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8:       vector.ph:
+; CHECK-VF8-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-VF8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-VF8-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8:       vector.body:
+; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-VF8-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
+; CHECK-VF8-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
+; CHECK-VF8-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; CHECK-VF8-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
+; CHECK-VF8-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
+; CHECK-VF8-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
+; CHECK-VF8-NEXT:    [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
+; CHECK-VF8-NEXT:    store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP13]], align 1
+; CHECK-VF8-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF8-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
+; CHECK-VF8-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
+; CHECK-VF8-NEXT:    [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
+; CHECK-VF8-NEXT:    store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64>* [[TMP17]], align 1
+; CHECK-VF8-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-VF8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-VF8-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF8:       middle.block:
+; CHECK-VF8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-VF8:       vec.epilog.iter.check:
+; CHECK-VF8-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-VF8-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-VF8:       vec.epilog.ph:
+; CHECK-VF8-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-VF8:       vec.epilog.vector.body:
+; CHECK-VF8-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-VF8-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
+; CHECK-VF8-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
+; CHECK-VF8-NEXT:    [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
+; CHECK-VF8-NEXT:    store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
+; CHECK-VF8-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
+; CHECK-VF8-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
+; CHECK-VF8-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-VF8:       vec.epilog.middle.block:
+; CHECK-VF8-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 1024, 1024
+; CHECK-VF8-NEXT:    br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-VF8:       vec.epilog.scalar.ph:
+; CHECK-VF8-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-VF8-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-VF8:       for.body:
+; CHECK-VF8-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
+; CHECK-VF8-NEXT:    store i64 1, i64* [[ARRAYIDX]], align 1
+; CHECK-VF8-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF8-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
+; CHECK-VF8-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF8:       exit.loopexit:
+; CHECK-VF8-NEXT:    br label [[EXIT]]
+; CHECK-VF8:       exit:
+; CHECK-VF8-NEXT:    ret void
+;
+entry:
+  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i64, i64* %A, i64 %iv
  store i64 1, i64* %arrayidx, align 1
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp ne i64 %iv.next, 1024
  br i1 %exitcond, label %for.body, label %exit

exit:
  ret void
}

 attributes #0 = { "target-features"="+sve" }
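For the @main_vf_vscale_x_2 test above, the expected arithmetic can be sanity-checked with this small self-contained snippet (plain C++ with hypothetical names, no LLVM dependency): vscale_range(8, 8) pins vscale to 8, so the vscale x 2 main loop is estimated at 16 lanes and the fixed VF=8 epilogue in the CHECK lines is admissible.

#include <cassert>

int main() {
  const unsigned KnownMinVF = 2; // main loop VF = vscale x 2
  const unsigned VScale = 8;     // pinned by vscale_range(8, 8): min == max
  // Estimated lanes handled per main-loop iteration.
  assert(KnownMinVF * VScale == 16);
  // A fixed VF=8 epilogue is narrower than the estimated runtime width.
  assert(8 < KnownMinVF * VScale);
}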
