44define void @reverse (ptr %dst , ptr %src , i64 %len ) {
55; APPLE-LABEL: define void @reverse(
66; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
7- ; APPLE-NEXT: [[ENTRY:.*:]]
8- ; APPLE-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
9- ; APPLE-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
10- ; APPLE: [[FOR_BODY_PREHEADER]]:
7+ ; APPLE-NEXT: [[FOR_BODY_PREHEADER:.*]]:
118; APPLE-NEXT: [[TMP5:%.*]] = add i64 [[LEN]], -1
129; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
1310; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
14- ; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA :.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
11+ ; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_UNR_LCSSA :.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
1512; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
1613; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
1714; APPLE-NEXT: br label %[[FOR_BODY:.*]]
18- ; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT :.*]]:
15+ ; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT :.*]]:
1916; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
20- ; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA ]]
21- ; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA ]]:
22- ; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT ]] ]
17+ ; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_UNR_LCSSA ]]
18+ ; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA ]]:
19+ ; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT ]] ]
2320; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
24- ; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT :.*]]
21+ ; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP :.*]]
2522; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
2623; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
2724; APPLE: [[FOR_BODY_EPIL]]:
@@ -36,10 +33,8 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
3633; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
3734; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
3835; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
39- ; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
40- ; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA]]:
41- ; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT]]
42- ; APPLE: [[FOR_COND_CLEANUP_LOOPEXIT]]:
36+ ; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
37+ ; APPLE: [[FOR_COND_CLEANUP_EPILOG_LCSSA]]:
4338; APPLE-NEXT: br label %[[FOR_COND_CLEANUP]]
4439; APPLE: [[FOR_COND_CLEANUP]]:
4540; APPLE-NEXT: ret void
@@ -96,18 +91,13 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
9691; APPLE-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
9792; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
9893; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
99- ; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT ]], label %[[FOR_BODY]]
94+ ; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT ]], label %[[FOR_BODY]]
10095;
10196; GENERIC-LABEL: define void @reverse(
10297; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
103- ; GENERIC-NEXT: [[ENTRY:.*:]]
104- ; GENERIC-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[LEN]], 0
105- ; GENERIC-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
106- ; GENERIC: [[FOR_BODY_PREHEADER]]:
98+ ; GENERIC-NEXT: [[FOR_BODY_PREHEADER:.*]]:
10799; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
108- ; GENERIC: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
109- ; GENERIC-NEXT: br label %[[FOR_COND_CLEANUP]]
110- ; GENERIC: [[FOR_COND_CLEANUP]]:
100+ ; GENERIC: [[FOR_COND_CLEANUP:.*]]:
111101; GENERIC-NEXT: ret void
112102; GENERIC: [[FOR_BODY]]:
113103; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -118,12 +108,8 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
118108; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
119109; GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
120110; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
121- ; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT ]], label %[[FOR_BODY]]
111+ ; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP ]], label %[[FOR_BODY]]
122112;
123- entry:
124- %cmp7 = icmp sgt i64 %len , 0
125- br i1 %cmp7 , label %for.body.preheader , label %for.cond.cleanup
126-
127113for.body.preheader: ; preds = %entry
128114 br label %for.body
129115
@@ -142,7 +128,131 @@ for.body: ; preds = %for.body.preheader,
142128 br i1 %exitcond.not , label %for.cond.cleanup , label %for.body
143129}
144130
131+
132+ ; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
133+ ; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
134+ ; *** IR Dump Before LoopUnrollPass on _Z21saxpy_tripcount1K_av1PfPKff ***
; saxpy-style kernel over float arrays that is already in vectorized form:
; each pass through vector.body handles four <4 x float> chunks (16 floats)
; and the loop is left once the incremented index equals 1024.
;   %dst - loaded from and then stored back to at the same addresses
;   %src - only loaded from
;   %a   - scalar that the entry block splats into a <4 x float>
; The APPLE and GENERIC check lines below both describe a single VECTOR_BODY
; with the same step (16) and bound (1024) as the input IR.
; NOTE(review): presumably this pins that the loop is left un-unrolled on both
; configurations -- confirm against the intent of the change.
135+ define void @saxpy_tripcount1K_av1 (ptr %dst , ptr %src , float %a ) {
136+ ; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
137+ ; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
138+ ; APPLE-NEXT: [[ENTRY:.*]]:
139+ ; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
140+ ; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
141+ ; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
142+ ; APPLE: [[VECTOR_BODY]]:
143+ ; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
144+ ; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
145+ ; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
146+ ; APPLE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
147+ ; APPLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
148+ ; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
149+ ; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
150+ ; APPLE-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
151+ ; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
152+ ; APPLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
153+ ; APPLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
154+ ; APPLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
155+ ; APPLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
156+ ; APPLE-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
157+ ; APPLE-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
158+ ; APPLE-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
159+ ; APPLE-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
160+ ; APPLE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
161+ ; APPLE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
162+ ; APPLE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
163+ ; APPLE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
164+ ; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
165+ ; APPLE-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
166+ ; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
167+ ; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
168+ ; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
169+ ; APPLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
170+ ; APPLE-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
171+ ; APPLE: [[FOR_COND_CLEANUP]]:
172+ ; APPLE-NEXT: ret void
173+ ;
174+ ; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
175+ ; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) {
176+ ; GENERIC-NEXT: [[ENTRY:.*]]:
177+ ; GENERIC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
178+ ; GENERIC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
179+ ; GENERIC-NEXT: br label %[[VECTOR_BODY:.*]]
180+ ; GENERIC: [[VECTOR_BODY]]:
181+ ; GENERIC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
182+ ; GENERIC-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
183+ ; GENERIC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
184+ ; GENERIC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
185+ ; GENERIC-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
186+ ; GENERIC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
187+ ; GENERIC-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
188+ ; GENERIC-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
189+ ; GENERIC-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
190+ ; GENERIC-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
191+ ; GENERIC-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
192+ ; GENERIC-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
193+ ; GENERIC-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
194+ ; GENERIC-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
195+ ; GENERIC-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
196+ ; GENERIC-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
197+ ; GENERIC-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
198+ ; GENERIC-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
199+ ; GENERIC-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
200+ ; GENERIC-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
201+ ; GENERIC-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
202+ ; GENERIC-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
203+ ; GENERIC-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
204+ ; GENERIC-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
205+ ; GENERIC-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
206+ ; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
207+ ; GENERIC-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
208+ ; GENERIC-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
209+ ; GENERIC: [[FOR_COND_CLEANUP]]:
210+ ; GENERIC-NEXT: ret void
211+ ;
; Input IR starts here. The entry block splats %a across all four lanes:
; insert it into lane 0, then shuffle with an all-zero mask.
212+ entry:
213+ %broadcast.splatinsert = insertelement <4 x float > poison, float %a , i64 0
214+ %broadcast.splat = shufflevector <4 x float > %broadcast.splatinsert , <4 x float > poison, <4 x i32 > zeroinitializer
215+ br label %vector.body
216+
; Loop body: %index counts float elements and starts at 0.
217+ vector.body: ; preds = %vector.body, %entry
218+ %index = phi i64 [ 0 , %entry ], [ %index.next , %vector.body ]
; Four source vectors at byte offsets 0/16/32/48 from &src[index].
219+ %0 = getelementptr inbounds nuw float , ptr %src , i64 %index
220+ %1 = getelementptr inbounds nuw i8 , ptr %0 , i64 16
221+ %2 = getelementptr inbounds nuw i8 , ptr %0 , i64 32
222+ %3 = getelementptr inbounds nuw i8 , ptr %0 , i64 48
223+ %wide.load = load <4 x float >, ptr %0 , align 4
224+ %wide.load12 = load <4 x float >, ptr %1 , align 4
225+ %wide.load13 = load <4 x float >, ptr %2 , align 4
226+ %wide.load14 = load <4 x float >, ptr %3 , align 4
; Matching four destination vectors at the same offsets from &dst[index].
227+ %4 = getelementptr inbounds nuw float , ptr %dst , i64 %index
228+ %5 = getelementptr inbounds nuw i8 , ptr %4 , i64 16
229+ %6 = getelementptr inbounds nuw i8 , ptr %4 , i64 32
230+ %7 = getelementptr inbounds nuw i8 , ptr %4 , i64 48
231+ %wide.load15 = load <4 x float >, ptr %4 , align 4
232+ %wide.load16 = load <4 x float >, ptr %5 , align 4
233+ %wide.load17 = load <4 x float >, ptr %6 , align 4
234+ %wide.load18 = load <4 x float >, ptr %7 , align 4
; Combine each pair as llvm.fmuladd(splat(%a), src chunk, dst chunk).
235+ %8 = call <4 x float > @llvm.fmuladd.v4f32 (<4 x float > %broadcast.splat , <4 x float > %wide.load , <4 x float > %wide.load15 )
236+ %9 = call <4 x float > @llvm.fmuladd.v4f32 (<4 x float > %broadcast.splat , <4 x float > %wide.load12 , <4 x float > %wide.load16 )
237+ %10 = call <4 x float > @llvm.fmuladd.v4f32 (<4 x float > %broadcast.splat , <4 x float > %wide.load13 , <4 x float > %wide.load17 )
238+ %11 = call <4 x float > @llvm.fmuladd.v4f32 (<4 x float > %broadcast.splat , <4 x float > %wide.load14 , <4 x float > %wide.load18 )
; Write the results back over the destination chunks that were just read.
239+ store <4 x float > %8 , ptr %4 , align 4
240+ store <4 x float > %9 , ptr %5 , align 4
241+ store <4 x float > %10 , ptr %6 , align 4
242+ store <4 x float > %11 , ptr %7 , align 4
; Step by 16 elements; exit when the next index equals 1024. The back-edge
; carries loop metadata node !22.
243+ %index.next = add nuw i64 %index , 16
244+ %12 = icmp eq i64 %index.next , 1024
245+ br i1 %12 , label %for.cond.cleanup , label %vector.body , !llvm.loop !22
246+
247+ for.cond.cleanup: ; preds = %vector.body
248+ ret void
249+ }
250+ !22 = !{!"llvm.loop.isvectorized" , i32 1 }
251+
145252;.
146253; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
147254; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
255+ ; APPLE: [[LOOP2]] = !{!"llvm.loop.isvectorized", i32 1}
256+ ;.
257+ ; GENERIC: [[LOOP0]] = !{!"llvm.loop.isvectorized", i32 1}
148258;.
0 commit comments