44define void @reverse (ptr %dst , ptr %src , i64 %len ) {
55; APPLE-LABEL: define void @reverse(
66; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
7- ; APPLE-NEXT: [[FOR_BODY_PREHEADER :.*]]:
7+ ; APPLE-NEXT: [[ENTRY :.*]]:
88; APPLE-NEXT: [[TMP5:%.*]] = add i64 [[LEN]], -1
99; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
1010; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
11- ; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_UNR_LCSSA :.*]], label %[[FOR_BODY_PREHEADER_NEW :.*]]
12- ; APPLE: [[FOR_BODY_PREHEADER_NEW ]]:
11+ ; APPLE-NEXT: br i1 [[TMP6]], label %[[EXIT_UNR_LCSSA :.*]], label %[[ENTRY_NEW :.*]]
12+ ; APPLE: [[ENTRY_NEW ]]:
1313; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
1414; APPLE-NEXT: br label %[[FOR_BODY:.*]]
15- ; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT:.*]]:
16- ; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
17- ; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_UNR_LCSSA]]
18- ; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA]]:
19- ; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]] ]
20- ; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
21- ; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
22- ; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
23- ; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
24- ; APPLE: [[FOR_BODY_EPIL]]:
25- ; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
26- ; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
27- ; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_EPIL]]
28- ; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
29- ; APPLE-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
30- ; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
31- ; APPLE-NEXT: store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
32- ; APPLE-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
33- ; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
34- ; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
35- ; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
36- ; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
37- ; APPLE: [[FOR_COND_CLEANUP_EPILOG_LCSSA]]:
38- ; APPLE-NEXT: br label %[[FOR_COND_CLEANUP]]
39- ; APPLE: [[FOR_COND_CLEANUP]]:
40- ; APPLE-NEXT: ret void
4115; APPLE: [[FOR_BODY]]:
42- ; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW ]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
43- ; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW ]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
16+ ; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW ]] ], [ [[INDVARS_IV_NEXT_7:%.* ]], %[[FOR_BODY]] ]
17+ ; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW ]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
4418; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
4519; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
4620; APPLE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
@@ -91,47 +65,70 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
9165; APPLE-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
9266; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
9367; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
94- ; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
68+ ; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
69+ ; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
70+ ; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
71+ ; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
72+ ; APPLE: [[EXIT_UNR_LCSSA]]:
73+ ; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
74+ ; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
75+ ; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
76+ ; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
77+ ; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
78+ ; APPLE: [[FOR_BODY_EPIL]]:
79+ ; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
80+ ; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
81+ ; APPLE-NEXT: [[TMP21:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
82+ ; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP21]]
83+ ; APPLE-NEXT: [[TMP22:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
84+ ; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
85+ ; APPLE-NEXT: store <4 x float> [[TMP22]], ptr [[ARRAYIDX2_EPIL]], align 16
86+ ; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
87+ ; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
88+ ; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
89+ ; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
90+ ; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
91+ ; APPLE: [[EXIT_EPILOG_LCSSA]]:
92+ ; APPLE-NEXT: br label %[[EXIT]]
93+ ; APPLE: [[EXIT]]:
94+ ; APPLE-NEXT: ret void
9595;
9696; GENERIC-LABEL: define void @reverse(
9797; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
98- ; GENERIC-NEXT: [[FOR_BODY_PREHEADER :.*]]:
98+ ; GENERIC-NEXT: [[ENTRY :.*]]:
9999; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
100- ; GENERIC: [[FOR_COND_CLEANUP:.*]]:
101- ; GENERIC-NEXT: ret void
102100; GENERIC: [[FOR_BODY]]:
103- ; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER ]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
101+ ; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY ]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
104102; GENERIC-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
105103; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
106104; GENERIC-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
107105; GENERIC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
108106; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
109107; GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
110108; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
111- ; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
109+ ; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
110+ ; GENERIC: [[EXIT]]:
111+ ; GENERIC-NEXT: ret void
112112;
113- for.body.preheader : ; preds = %entry
113+ entry :
114114 br label %for.body
115115
116- for.cond.cleanup: ; preds = %for.body, %entry
117- ret void
118-
119- for.body: ; preds = %for.body.preheader, %for.body
120- %iv = phi i64 [ 0 , %for.body.preheader ], [ %iv.next , %for.body ]
116+ for.body: ; preds = %entry, %for.body
117+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %for.body ]
121118 %1 = sub nsw i64 %len , %iv
122119 %arrayidx = getelementptr inbounds <4 x float >, ptr %src , i64 %1
123120 %2 = load <4 x float >, ptr %arrayidx , align 16
124121 %arrayidx2 = getelementptr inbounds nuw <4 x float >, ptr %dst , i64 %iv
125122 store <4 x float > %2 , ptr %arrayidx2 , align 16
126123 %iv.next = add nuw nsw i64 %iv , 1
127124 %exitcond.not = icmp eq i64 %iv.next , %len
128- br i1 %exitcond.not , label %for.cond.cleanup , label %for.body
125+ br i1 %exitcond.not , label %exit , label %for.body
126+
127+ exit: ; preds = %for.body
128+ ret void
129129}
130130
131131
132- ; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
133- ; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
134- ; *** IR Dump Before LoopUnrollPass on _Z21saxpy_tripcount1K_av1PfPKff ***
135132define void @saxpy_tripcount1K_av1 (ptr %dst , ptr %src , float %a ) {
136133; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
137134; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
@@ -142,33 +139,15 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
142139; APPLE: [[VECTOR_BODY]]:
143140; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
144141; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
145- ; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
146- ; APPLE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
147- ; APPLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
148- ; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
149- ; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
150- ; APPLE-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
151- ; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
142+ ; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
152143; APPLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
153- ; APPLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
154- ; APPLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
155- ; APPLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
156- ; APPLE-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
157- ; APPLE-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
158- ; APPLE-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
159- ; APPLE-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
160- ; APPLE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
161- ; APPLE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
162- ; APPLE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
144+ ; APPLE-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
163145; APPLE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
164- ; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
165- ; APPLE-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
166- ; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
167- ; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
168- ; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
146+ ; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
147+ ; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
169148; APPLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
170- ; APPLE-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP :.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
171- ; APPLE: [[FOR_COND_CLEANUP ]]:
149+ ; APPLE-NEXT: br i1 [[TMP12]], label %[[EXIT :.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
150+ ; APPLE: [[EXIT ]]:
172151; APPLE-NEXT: ret void
173152;
174153; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
@@ -180,33 +159,15 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
180159; GENERIC: [[VECTOR_BODY]]:
181160; GENERIC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
182161; GENERIC-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
183- ; GENERIC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
184- ; GENERIC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
185- ; GENERIC-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
186- ; GENERIC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
187- ; GENERIC-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
188- ; GENERIC-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
189- ; GENERIC-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
162+ ; GENERIC-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
190163; GENERIC-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
191- ; GENERIC-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
192- ; GENERIC-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
193- ; GENERIC-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
194- ; GENERIC-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
195- ; GENERIC-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
196- ; GENERIC-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
197- ; GENERIC-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
198- ; GENERIC-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
199- ; GENERIC-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
200- ; GENERIC-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
164+ ; GENERIC-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
201165; GENERIC-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
202- ; GENERIC-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
203- ; GENERIC-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
204- ; GENERIC-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
205- ; GENERIC-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
206- ; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
166+ ; GENERIC-NEXT: store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
167+ ; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
207168; GENERIC-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
208- ; GENERIC-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP :.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
209- ; GENERIC: [[FOR_COND_CLEANUP ]]:
169+ ; GENERIC-NEXT: br i1 [[TMP12]], label %[[EXIT :.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
170+ ; GENERIC: [[EXIT ]]:
210171; GENERIC-NEXT: ret void
211172;
212173entry:
@@ -217,37 +178,19 @@ entry:
217178vector.body: ; preds = %vector.body, %entry
218179 %index = phi i64 [ 0 , %entry ], [ %index.next , %vector.body ]
219180 %0 = getelementptr inbounds nuw float , ptr %src , i64 %index
220- %1 = getelementptr inbounds nuw i8 , ptr %0 , i64 16
221- %2 = getelementptr inbounds nuw i8 , ptr %0 , i64 32
222- %3 = getelementptr inbounds nuw i8 , ptr %0 , i64 48
223181 %wide.load = load <4 x float >, ptr %0 , align 4
182+ %1 = getelementptr inbounds nuw float , ptr %dst , i64 %index
224183 %wide.load12 = load <4 x float >, ptr %1 , align 4
225- %wide.load13 = load <4 x float >, ptr %2 , align 4
226- %wide.load14 = load <4 x float >, ptr %3 , align 4
227- %4 = getelementptr inbounds nuw float , ptr %dst , i64 %index
228- %5 = getelementptr inbounds nuw i8 , ptr %4 , i64 16
229- %6 = getelementptr inbounds nuw i8 , ptr %4 , i64 32
230- %7 = getelementptr inbounds nuw i8 , ptr %4 , i64 48
231- %wide.load15 = load <4 x float >, ptr %4 , align 4
232- %wide.load16 = load <4 x float >, ptr %5 , align 4
233- %wide.load17 = load <4 x float >, ptr %6 , align 4
234- %wide.load18 = load <4 x float >, ptr %7 , align 4
235- %8 = call <4 x float > @llvm.fmuladd.v4f32 (<4 x float > %broadcast.splat , <4 x float > %wide.load , <4 x float > %wide.load15 )
236- %9 = call <4 x float > @llvm.fmuladd.v4f32 (<4 x float > %broadcast.splat , <4 x float > %wide.load12 , <4 x float > %wide.load16 )
237- %10 = call <4 x float > @llvm.fmuladd.v4f32 (<4 x float > %broadcast.splat , <4 x float > %wide.load13 , <4 x float > %wide.load17 )
238- %11 = call <4 x float > @llvm.fmuladd.v4f32 (<4 x float > %broadcast.splat , <4 x float > %wide.load14 , <4 x float > %wide.load18 )
239- store <4 x float > %8 , ptr %4 , align 4
240- store <4 x float > %9 , ptr %5 , align 4
241- store <4 x float > %10 , ptr %6 , align 4
242- store <4 x float > %11 , ptr %7 , align 4
243- %index.next = add nuw i64 %index , 16
244- %12 = icmp eq i64 %index.next , 1024
245- br i1 %12 , label %for.cond.cleanup , label %vector.body , !llvm.loop !22
184+ %2 = call <4 x float > @llvm.fmuladd.v4f32 (<4 x float > %broadcast.splat , <4 x float > %wide.load , <4 x float > %wide.load12 )
185+ store <4 x float > %2 , ptr %1 , align 4
186+ %index.next = add nuw i64 %index , 4
187+ %3 = icmp eq i64 %index.next , 1024
188+ br i1 %3 , label %exit , label %vector.body , !llvm.loop !0
246189
247- for.cond.cleanup : ; preds = %vector.body
190+ exit : ; preds = %vector.body
248191 ret void
249192}
250- !22 = !{!"llvm.loop.isvectorized" , i32 1 }
193+ !0 = !{!"llvm.loop.isvectorized" , i32 1 }
251194
252195;.
253196; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
0 commit comments