Skip to content

Commit 1553324

Browse files
committed
a 2nd test for autovectorized loop with static tripcount
1 parent 7ff7ce3 commit 1553324

File tree

1 file changed

+137
-27
lines changed
  • llvm/test/Transforms/LoopUnroll/AArch64

1 file changed

+137
-27
lines changed

llvm/test/Transforms/LoopUnroll/AArch64/vector.ll

Lines changed: 137 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,21 @@
44
define void @reverse(ptr %dst, ptr %src, i64 %len) {
55
; APPLE-LABEL: define void @reverse(
66
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
7-
; APPLE-NEXT: [[ENTRY:.*:]]
8-
; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
9-
; APPLE-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
10-
; APPLE: [[FOR_BODY_PREHEADER]]:
7+
; APPLE-NEXT: [[FOR_BODY_PREHEADER:.*]]:
118
; APPLE-NEXT: [[TMP5:%.*]] = add i64 [[LEN]], -1
129
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
1310
; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
14-
; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
11+
; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
1512
; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
1613
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
1714
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
18-
; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:.*]]:
15+
; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT:.*]]:
1916
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
20-
; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]
21-
; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]:
22-
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
17+
; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_UNR_LCSSA]]
18+
; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA]]:
19+
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]] ]
2320
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
24-
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
21+
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
2522
; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
2623
; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
2724
; APPLE: [[FOR_BODY_EPIL]]:
@@ -36,10 +33,8 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
3633
; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
3734
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
3835
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
39-
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
40-
; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA]]:
41-
; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT]]
42-
; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT]]:
36+
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
37+
; APPLE: [[FOR_COND_CLEANUP_EPILOG_LCSSA]]:
4338
; APPLE-NEXT: br label %[[FOR_COND_CLEANUP]]
4439
; APPLE: [[FOR_COND_CLEANUP]]:
4540
; APPLE-NEXT: ret void
@@ -96,18 +91,13 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
9691
; APPLE-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
9792
; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
9893
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
99-
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
94+
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
10095
;
10196
; GENERIC-LABEL: define void @reverse(
10297
; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
103-
; GENERIC-NEXT: [[ENTRY:.*:]]
104-
; GENERIC-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
105-
; GENERIC-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
106-
; GENERIC: [[FOR_BODY_PREHEADER]]:
98+
; GENERIC-NEXT: [[FOR_BODY_PREHEADER:.*]]:
10799
; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
108-
; GENERIC: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
109-
; GENERIC-NEXT: br label %[[FOR_COND_CLEANUP]]
110-
; GENERIC: [[FOR_COND_CLEANUP]]:
100+
; GENERIC: [[FOR_COND_CLEANUP:.*]]:
111101
; GENERIC-NEXT: ret void
112102
; GENERIC: [[FOR_BODY]]:
113103
; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -118,12 +108,8 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
118108
; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
119109
; GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
120110
; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
121-
; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
111+
; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
122112
;
123-
entry:
124-
%cmp7 = icmp sgt i64 %len, 0
125-
br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
126-
127113
for.body.preheader: ; preds = %entry
128114
br label %for.body
129115

@@ -142,7 +128,131 @@ for.body: ; preds = %for.body.preheader,
142128
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
143129
}
144130

131+
132+
; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
133+
; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
134+
; Auto-vectorized saxpy loop with a static trip count; IR captured before LoopUnrollPass for _Z21saxpy_tripcount1K_av1PfPKff.
135+
; saxpy_tripcount1K_av1 -- loop-unroll test input.
; Each iteration of %vector.body loads four <4 x float> chunks from %src and
; from %dst, computes fmuladd(splat(%a), src, dst) on each, and stores the
; results back to %dst. %index advances by 16 per iteration and the loop exits
; when it equals 1024.
; The check lines for both prefixes below expect a single copy of this body
; (step 16, compare against 1024), i.e. the same loop shape as the input IR.
define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
136+
; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
137+
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
138+
; APPLE-NEXT: [[ENTRY:.*]]:
139+
; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
140+
; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
141+
; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
142+
; APPLE: [[VECTOR_BODY]]:
143+
; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
144+
; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
145+
; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
146+
; APPLE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
147+
; APPLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
148+
; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
149+
; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
150+
; APPLE-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
151+
; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
152+
; APPLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
153+
; APPLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
154+
; APPLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
155+
; APPLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
156+
; APPLE-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
157+
; APPLE-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
158+
; APPLE-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
159+
; APPLE-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
160+
; APPLE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
161+
; APPLE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
162+
; APPLE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
163+
; APPLE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
164+
; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
165+
; APPLE-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
166+
; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
167+
; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
168+
; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
169+
; APPLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
170+
; APPLE-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
171+
; APPLE: [[FOR_COND_CLEANUP]]:
172+
; APPLE-NEXT: ret void
173+
;
174+
; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
175+
; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
176+
; GENERIC-NEXT: [[ENTRY:.*]]:
177+
; GENERIC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
178+
; GENERIC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
179+
; GENERIC-NEXT: br label %[[VECTOR_BODY:.*]]
180+
; GENERIC: [[VECTOR_BODY]]:
181+
; GENERIC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
182+
; GENERIC-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
183+
; GENERIC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
184+
; GENERIC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
185+
; GENERIC-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
186+
; GENERIC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
187+
; GENERIC-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
188+
; GENERIC-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
189+
; GENERIC-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
190+
; GENERIC-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
191+
; GENERIC-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
192+
; GENERIC-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
193+
; GENERIC-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
194+
; GENERIC-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
195+
; GENERIC-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
196+
; GENERIC-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
197+
; GENERIC-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
198+
; GENERIC-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
199+
; GENERIC-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
200+
; GENERIC-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
201+
; GENERIC-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
202+
; GENERIC-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
203+
; GENERIC-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
204+
; GENERIC-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
205+
; GENERIC-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
206+
; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
207+
; GENERIC-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
208+
; GENERIC-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
209+
; GENERIC: [[FOR_COND_CLEANUP]]:
210+
; GENERIC-NEXT: ret void
211+
;
212+
entry:
213+
; Splat the scalar %a into all four lanes once, outside the loop.
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
214+
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
215+
br label %vector.body
216+
217+
vector.body: ; preds = %vector.body, %entry
218+
; %index is the float-element offset into %src and %dst; starts at 0.
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
219+
; Four source chunk addresses: %0 plus byte offsets 16, 32 and 48.
%0 = getelementptr inbounds nuw float, ptr %src, i64 %index
220+
%1 = getelementptr inbounds nuw i8, ptr %0, i64 16
221+
%2 = getelementptr inbounds nuw i8, ptr %0, i64 32
222+
%3 = getelementptr inbounds nuw i8, ptr %0, i64 48
223+
%wide.load = load <4 x float>, ptr %0, align 4
224+
%wide.load12 = load <4 x float>, ptr %1, align 4
225+
%wide.load13 = load <4 x float>, ptr %2, align 4
226+
%wide.load14 = load <4 x float>, ptr %3, align 4
227+
; Matching four destination chunk addresses; loaded here and stored to below.
%4 = getelementptr inbounds nuw float, ptr %dst, i64 %index
228+
%5 = getelementptr inbounds nuw i8, ptr %4, i64 16
229+
%6 = getelementptr inbounds nuw i8, ptr %4, i64 32
230+
%7 = getelementptr inbounds nuw i8, ptr %4, i64 48
231+
%wide.load15 = load <4 x float>, ptr %4, align 4
232+
%wide.load16 = load <4 x float>, ptr %5, align 4
233+
%wide.load17 = load <4 x float>, ptr %6, align 4
234+
%wide.load18 = load <4 x float>, ptr %7, align 4
235+
; fmuladd(splat(%a), src chunk, dst chunk) for each of the four chunks.
%8 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load15)
236+
%9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x float> %wide.load16)
237+
%10 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load13, <4 x float> %wide.load17)
238+
%11 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load14, <4 x float> %wide.load18)
239+
store <4 x float> %8, ptr %4, align 4
240+
store <4 x float> %9, ptr %5, align 4
241+
store <4 x float> %10, ptr %6, align 4
242+
store <4 x float> %11, ptr %7, align 4
243+
; Advance by 16 elements; exit once the constant bound 1024 is reached.
%index.next = add nuw i64 %index, 16
244+
%12 = icmp eq i64 %index.next, 1024
245+
; The back-edge carries !llvm.loop !22.
br i1 %12, label %for.cond.cleanup, label %vector.body, !llvm.loop !22
246+
247+
for.cond.cleanup: ; preds = %vector.body
248+
ret void
249+
}
250+
; Metadata node referenced by the !llvm.loop attachment on the vector.body back-edge.
; NOTE(review): LangRef describes a loop ID as a node whose first operand is a
; self-reference (e.g. `!N = distinct !{!N, !props}`), with properties such as
; "llvm.loop.isvectorized" held in nested nodes. Here the property node itself is
; used as the loop ID -- confirm the isvectorized hint is actually seen by the pass;
; changing this would require regenerating the metadata check lines.
!22 = !{!"llvm.loop.isvectorized", i32 1}
251+
145252
;.
146253
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
147254
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
255+
; APPLE: [[LOOP2]] = !{!"llvm.loop.isvectorized", i32 1}
256+
;.
257+
; GENERIC: [[LOOP0]] = !{!"llvm.loop.isvectorized", i32 1}
148258
;.

0 commit comments

Comments
 (0)