Skip to content

Commit a3c2ac6

Browse files
committed
shortened the test & cleanup entry/exit labels
1 parent 1553324 commit a3c2ac6

File tree

1 file changed

+64
-121
lines changed
  • llvm/test/Transforms/LoopUnroll/AArch64

1 file changed

+64
-121
lines changed

llvm/test/Transforms/LoopUnroll/AArch64/vector.ll

Lines changed: 64 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -4,43 +4,17 @@
44
define void @reverse(ptr %dst, ptr %src, i64 %len) {
55
; APPLE-LABEL: define void @reverse(
66
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
7-
; APPLE-NEXT: [[FOR_BODY_PREHEADER:.*]]:
7+
; APPLE-NEXT: [[ENTRY:.*]]:
88
; APPLE-NEXT: [[TMP5:%.*]] = add i64 [[LEN]], -1
99
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
1010
; APPLE-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 7
11-
; APPLE-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
12-
; APPLE: [[FOR_BODY_PREHEADER_NEW]]:
11+
; APPLE-NEXT: br i1 [[TMP6]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
12+
; APPLE: [[ENTRY_NEW]]:
1313
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
1414
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
15-
; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT:.*]]:
16-
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
17-
; APPLE-NEXT: br label %[[FOR_COND_CLEANUP_UNR_LCSSA]]
18-
; APPLE: [[FOR_COND_CLEANUP_UNR_LCSSA]]:
19-
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[IV_UNR_PH]], %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]] ]
20-
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
21-
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
22-
; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
23-
; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
24-
; APPLE: [[FOR_BODY_EPIL]]:
25-
; APPLE-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
26-
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
27-
; APPLE-NEXT: [[TMP3:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV_EPIL]]
28-
; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP3]]
29-
; APPLE-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
30-
; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV_EPIL]]
31-
; APPLE-NEXT: store <4 x float> [[TMP4]], ptr [[ARRAYIDX2_EPIL]], align 16
32-
; APPLE-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
33-
; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[LEN]]
34-
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
35-
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
36-
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[FOR_COND_CLEANUP_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
37-
; APPLE: [[FOR_COND_CLEANUP_EPILOG_LCSSA]]:
38-
; APPLE-NEXT: br label %[[FOR_COND_CLEANUP]]
39-
; APPLE: [[FOR_COND_CLEANUP]]:
40-
; APPLE-NEXT: ret void
4115
; APPLE: [[FOR_BODY]]:
42-
; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
43-
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
16+
; APPLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
17+
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
4418
; APPLE-NEXT: [[TMP1:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
4519
; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP1]]
4620
; APPLE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
@@ -91,47 +65,70 @@ define void @reverse(ptr %dst, ptr %src, i64 %len) {
9165
; APPLE-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
9266
; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
9367
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
94-
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]], label %[[FOR_BODY]]
68+
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
69+
; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
70+
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_7]], %[[FOR_BODY]] ]
71+
; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
72+
; APPLE: [[EXIT_UNR_LCSSA]]:
73+
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
74+
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
75+
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
76+
; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
77+
; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
78+
; APPLE: [[FOR_BODY_EPIL]]:
79+
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
80+
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
81+
; APPLE-NEXT: [[TMP21:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
82+
; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP21]]
83+
; APPLE-NEXT: [[TMP22:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
84+
; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
85+
; APPLE-NEXT: store <4 x float> [[TMP22]], ptr [[ARRAYIDX2_EPIL]], align 16
86+
; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
87+
; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
88+
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
89+
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
90+
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
91+
; APPLE: [[EXIT_EPILOG_LCSSA]]:
92+
; APPLE-NEXT: br label %[[EXIT]]
93+
; APPLE: [[EXIT]]:
94+
; APPLE-NEXT: ret void
9595
;
9696
; GENERIC-LABEL: define void @reverse(
9797
; GENERIC-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) {
98-
; GENERIC-NEXT: [[FOR_BODY_PREHEADER:.*]]:
98+
; GENERIC-NEXT: [[ENTRY:.*]]:
9999
; GENERIC-NEXT: br label %[[FOR_BODY:.*]]
100-
; GENERIC: [[FOR_COND_CLEANUP:.*]]:
101-
; GENERIC-NEXT: ret void
102100
; GENERIC: [[FOR_BODY]]:
103-
; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
101+
; GENERIC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
104102
; GENERIC-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[INDVARS_IV]]
105103
; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
106104
; GENERIC-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
107105
; GENERIC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[INDVARS_IV]]
108106
; GENERIC-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
109107
; GENERIC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
110108
; GENERIC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
111-
; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
109+
; GENERIC-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
110+
; GENERIC: [[EXIT]]:
111+
; GENERIC-NEXT: ret void
112112
;
113-
for.body.preheader: ; preds = %entry
113+
entry:
114114
br label %for.body
115115

116-
for.cond.cleanup: ; preds = %for.body, %entry
117-
ret void
118-
119-
for.body: ; preds = %for.body.preheader, %for.body
120-
%iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
116+
for.body: ; preds = %entry, %for.body
117+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
121118
%1 = sub nsw i64 %len, %iv
122119
%arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
123120
%2 = load <4 x float>, ptr %arrayidx, align 16
124121
%arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
125122
store <4 x float> %2, ptr %arrayidx2, align 16
126123
%iv.next = add nuw nsw i64 %iv, 1
127124
%exitcond.not = icmp eq i64 %iv.next, %len
128-
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
125+
br i1 %exitcond.not, label %exit, label %for.body
126+
127+
exit: ; preds = %for.body
128+
ret void
129129
}
130130

131131

132-
; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
133-
; RUN: opt -p loop-unroll -S %s -mtriple aarch64 | FileCheck %s -check-prefix=GENERIC
134-
; *** IR Dump Before LoopUnrollPass on _Z21saxpy_tripcount1K_av1PfPKff ***
135132
define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
136133
; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
137134
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
@@ -142,33 +139,15 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
142139
; APPLE: [[VECTOR_BODY]]:
143140
; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
144141
; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
145-
; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
146-
; APPLE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
147-
; APPLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
148-
; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
149-
; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
150-
; APPLE-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
151-
; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
142+
; APPLE-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
152143
; APPLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
153-
; APPLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
154-
; APPLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
155-
; APPLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
156-
; APPLE-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
157-
; APPLE-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
158-
; APPLE-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
159-
; APPLE-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
160-
; APPLE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
161-
; APPLE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
162-
; APPLE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
144+
; APPLE-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
163145
; APPLE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
164-
; APPLE-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
165-
; APPLE-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
166-
; APPLE-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
167-
; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
168-
; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
146+
; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
147+
; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
169148
; APPLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
170-
; APPLE-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
171-
; APPLE: [[FOR_COND_CLEANUP]]:
149+
; APPLE-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
150+
; APPLE: [[EXIT]]:
172151
; APPLE-NEXT: ret void
173152
;
174153
; GENERIC-LABEL: define void @saxpy_tripcount1K_av1(
@@ -180,33 +159,15 @@ define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
180159
; GENERIC: [[VECTOR_BODY]]:
181160
; GENERIC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
182161
; GENERIC-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
183-
; GENERIC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
184-
; GENERIC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 32
185-
; GENERIC-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
186-
; GENERIC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
187-
; GENERIC-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
188-
; GENERIC-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
189-
; GENERIC-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
162+
; GENERIC-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
190163
; GENERIC-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
191-
; GENERIC-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 16
192-
; GENERIC-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 32
193-
; GENERIC-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 48
194-
; GENERIC-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
195-
; GENERIC-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
196-
; GENERIC-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
197-
; GENERIC-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
198-
; GENERIC-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD15]])
199-
; GENERIC-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD12]], <4 x float> [[WIDE_LOAD16]])
200-
; GENERIC-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD13]], <4 x float> [[WIDE_LOAD17]])
164+
; GENERIC-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
201165
; GENERIC-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD14]], <4 x float> [[WIDE_LOAD18]])
202-
; GENERIC-NEXT: store <4 x float> [[TMP8]], ptr [[TMP4]], align 4
203-
; GENERIC-NEXT: store <4 x float> [[TMP9]], ptr [[TMP5]], align 4
204-
; GENERIC-NEXT: store <4 x float> [[TMP10]], ptr [[TMP6]], align 4
205-
; GENERIC-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4
206-
; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
166+
; GENERIC-NEXT: store <4 x float> [[TMP11]], ptr [[TMP4]], align 4
167+
; GENERIC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
207168
; GENERIC-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
208-
; GENERIC-NEXT: br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
209-
; GENERIC: [[FOR_COND_CLEANUP]]:
169+
; GENERIC-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
170+
; GENERIC: [[EXIT]]:
210171
; GENERIC-NEXT: ret void
211172
;
212173
entry:
@@ -217,37 +178,19 @@ entry:
217178
vector.body: ; preds = %vector.body, %entry
218179
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
219180
%0 = getelementptr inbounds nuw float, ptr %src, i64 %index
220-
%1 = getelementptr inbounds nuw i8, ptr %0, i64 16
221-
%2 = getelementptr inbounds nuw i8, ptr %0, i64 32
222-
%3 = getelementptr inbounds nuw i8, ptr %0, i64 48
223181
%wide.load = load <4 x float>, ptr %0, align 4
182+
%1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
224183
%wide.load12 = load <4 x float>, ptr %1, align 4
225-
%wide.load13 = load <4 x float>, ptr %2, align 4
226-
%wide.load14 = load <4 x float>, ptr %3, align 4
227-
%4 = getelementptr inbounds nuw float, ptr %dst, i64 %index
228-
%5 = getelementptr inbounds nuw i8, ptr %4, i64 16
229-
%6 = getelementptr inbounds nuw i8, ptr %4, i64 32
230-
%7 = getelementptr inbounds nuw i8, ptr %4, i64 48
231-
%wide.load15 = load <4 x float>, ptr %4, align 4
232-
%wide.load16 = load <4 x float>, ptr %5, align 4
233-
%wide.load17 = load <4 x float>, ptr %6, align 4
234-
%wide.load18 = load <4 x float>, ptr %7, align 4
235-
%8 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load15)
236-
%9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x float> %wide.load16)
237-
%10 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load13, <4 x float> %wide.load17)
238-
%11 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load14, <4 x float> %wide.load18)
239-
store <4 x float> %8, ptr %4, align 4
240-
store <4 x float> %9, ptr %5, align 4
241-
store <4 x float> %10, ptr %6, align 4
242-
store <4 x float> %11, ptr %7, align 4
243-
%index.next = add nuw i64 %index, 16
244-
%12 = icmp eq i64 %index.next, 1024
245-
br i1 %12, label %for.cond.cleanup, label %vector.body, !llvm.loop !22
184+
%2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
185+
store <4 x float> %2, ptr %1, align 4
186+
%index.next = add nuw i64 %index, 4
187+
%3 = icmp eq i64 %index.next, 1024
188+
br i1 %3, label %exit, label %vector.body, !llvm.loop !0
246189

247-
for.cond.cleanup: ; preds = %vector.body
190+
exit: ; preds = %vector.body
248191
ret void
249192
}
250-
!22 = !{!"llvm.loop.isvectorized", i32 1}
193+
!0 = !{!"llvm.loop.isvectorized", i32 1}
251194

252195
;.
253196
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}

0 commit comments

Comments
 (0)