You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
[LLVM][CodeGen][SVE] Use BFMLALB for promoted bfloat fma operations.
NOTE: As far as I can see, LLVM has no support for FEAT_AFP, either in
feature detection or in ACLE builtins, so I believe the compiler can
(and does) work under the assumption that the feature is not enabled.
define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
8
8
; SVE-LABEL: fmla_nxv8bf16:
9
9
; SVE: // %bb.0:
10
-
; SVE-NEXT: uunpkhi z3.s, z2.h
11
-
; SVE-NEXT: uunpkhi z4.s, z1.h
12
-
; SVE-NEXT: uunpkhi z5.s, z0.h
10
+
; SVE-NEXT: uunpkhi z3.s, z0.h
11
+
; SVE-NEXT: uunpklo z0.s, z0.h
12
+
; SVE-NEXT: uunpkhi z4.s, z2.h
13
+
; SVE-NEXT: uunpkhi z5.s, z1.h
13
14
; SVE-NEXT: uunpklo z2.s, z2.h
14
15
; SVE-NEXT: uunpklo z1.s, z1.h
15
-
; SVE-NEXT: uunpklo z0.s, z0.h
16
16
; SVE-NEXT: ptrue p0.s
17
17
; SVE-NEXT: lsl z3.s, z3.s, #16
18
-
; SVE-NEXT: lsl z4.s, z4.s, #16
19
-
; SVE-NEXT: lsl z5.s, z5.s, #16
20
-
; SVE-NEXT: lsl z2.s, z2.s, #16
21
-
; SVE-NEXT: lsl z1.s, z1.s, #16
22
18
; SVE-NEXT: lsl z0.s, z0.s, #16
23
-
; SVE-NEXT: fmad z3.s, p0/m, z4.s, z5.s
24
-
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
19
+
; SVE-NEXT: bfmlalb z3.s, z5.h, z4.h
20
+
; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
25
21
; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
26
22
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
27
23
; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
@@ -40,11 +36,9 @@ define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
40
36
define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
41
37
; SVE-LABEL: fmla_nxv4bf16:
42
38
; SVE: // %bb.0:
43
-
; SVE-NEXT: lsl z2.s, z2.s, #16
44
-
; SVE-NEXT: lsl z1.s, z1.s, #16
45
39
; SVE-NEXT: lsl z0.s, z0.s, #16
46
40
; SVE-NEXT: ptrue p0.s
47
-
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
41
+
; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
48
42
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
49
43
; SVE-NEXT: ret
50
44
;
@@ -83,22 +77,18 @@ define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
83
77
; SVE-LABEL: fmls_nxv8bf16:
84
78
; SVE: // %bb.0:
85
79
; SVE-NEXT: ptrue p0.h
86
-
; SVE-NEXT: uunpkhi z3.s, z2.h
87
-
; SVE-NEXT: uunpkhi z4.s, z0.h
88
-
; SVE-NEXT: uunpklo z2.s, z2.h
80
+
; SVE-NEXT: uunpkhi z3.s, z0.h
89
81
; SVE-NEXT: uunpklo z0.s, z0.h
82
+
; SVE-NEXT: uunpkhi z5.s, z2.h
83
+
; SVE-NEXT: uunpklo z2.s, z2.h
90
84
; SVE-NEXT: fneg z1.h, p0/m, z1.h
91
85
; SVE-NEXT: ptrue p0.s
92
86
; SVE-NEXT: lsl z3.s, z3.s, #16
93
-
; SVE-NEXT: lsl z4.s, z4.s, #16
94
-
; SVE-NEXT: lsl z2.s, z2.s, #16
95
87
; SVE-NEXT: lsl z0.s, z0.s, #16
96
-
; SVE-NEXT: uunpkhi z5.s, z1.h
88
+
; SVE-NEXT: uunpkhi z4.s, z1.h
97
89
; SVE-NEXT: uunpklo z1.s, z1.h
98
-
; SVE-NEXT: lsl z5.s, z5.s, #16
99
-
; SVE-NEXT: lsl z1.s, z1.s, #16
100
-
; SVE-NEXT: fmad z3.s, p0/m, z5.s, z4.s
101
-
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
90
+
; SVE-NEXT: bfmlalb z3.s, z4.h, z5.h
91
+
; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
102
92
; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
103
93
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
104
94
; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
@@ -118,11 +108,9 @@ define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale
118
108
; SVE-LABEL: fmls_nxv4bf16:
119
109
; SVE: // %bb.0:
120
110
; SVE-NEXT: ptrue p0.s
121
-
; SVE-NEXT: lsl z2.s, z2.s, #16
122
111
; SVE-NEXT: lsl z0.s, z0.s, #16
123
112
; SVE-NEXT: fneg z1.h, p0/m, z1.h
124
-
; SVE-NEXT: lsl z1.s, z1.s, #16
125
-
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
113
+
; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
126
114
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
127
115
; SVE-NEXT: ret
128
116
;
@@ -161,24 +149,20 @@ define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale
161
149
define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
162
150
; SVE-LABEL: fmla_sel_nxv8bf16:
163
151
; SVE: // %bb.0:
164
-
; SVE-NEXT: uunpkhi z3.s, z2.h
165
-
; SVE-NEXT: uunpkhi z4.s, z1.h
166
-
; SVE-NEXT: uunpkhi z5.s, z0.h
152
+
; SVE-NEXT: uunpkhi z3.s, z0.h
153
+
; SVE-NEXT: uunpklo z4.s, z0.h
154
+
; SVE-NEXT: uunpkhi z5.s, z2.h
155
+
; SVE-NEXT: uunpkhi z6.s, z1.h
167
156
; SVE-NEXT: uunpklo z2.s, z2.h
168
157
; SVE-NEXT: uunpklo z1.s, z1.h
169
-
; SVE-NEXT: uunpklo z6.s, z0.h
170
158
; SVE-NEXT: ptrue p1.s
171
159
; SVE-NEXT: lsl z3.s, z3.s, #16
172
160
; SVE-NEXT: lsl z4.s, z4.s, #16
173
-
; SVE-NEXT: lsl z5.s, z5.s, #16
174
-
; SVE-NEXT: lsl z2.s, z2.s, #16
175
-
; SVE-NEXT: lsl z1.s, z1.s, #16
176
-
; SVE-NEXT: lsl z6.s, z6.s, #16
177
-
; SVE-NEXT: fmad z3.s, p1/m, z4.s, z5.s
178
-
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s
179
-
; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
180
-
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
181
-
; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
161
+
; SVE-NEXT: bfmlalb z3.s, z6.h, z5.h
162
+
; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h
163
+
; SVE-NEXT: bfcvt z1.h, p1/m, z3.s
164
+
; SVE-NEXT: bfcvt z2.h, p1/m, z4.s
165
+
; SVE-NEXT: uzp1 z1.h, z2.h, z1.h
182
166
; SVE-NEXT: mov z0.h, p0/m, z1.h
183
167
; SVE-NEXT: ret
184
168
;
@@ -195,12 +179,9 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
195
179
define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
196
180
; SVE-LABEL: fmla_sel_nxv4bf16:
197
181
; SVE: // %bb.0:
198
-
; SVE-NEXT: lsl z2.s, z2.s, #16
199
-
; SVE-NEXT: lsl z1.s, z1.s, #16
200
182
; SVE-NEXT: lsl z3.s, z0.s, #16
201
-
; SVE-NEXT: ptrue p1.s
202
-
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
203
-
; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
183
+
; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h
184
+
; SVE-NEXT: bfcvt z0.h, p0/m, z3.s
204
185
; SVE-NEXT: ret
205
186
;
206
187
; SVE-B16B16-LABEL: fmla_sel_nxv4bf16:
@@ -238,25 +219,21 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
238
219
; SVE-LABEL: fmls_sel_nxv8bf16:
239
220
; SVE: // %bb.0:
240
221
; SVE-NEXT: ptrue p1.h
241
-
; SVE-NEXT: uunpkhi z3.s, z2.h
242
-
; SVE-NEXT: uunpkhi z4.s, z0.h
222
+
; SVE-NEXT: uunpkhi z3.s, z0.h
223
+
; SVE-NEXT: uunpklo z4.s, z0.h
224
+
; SVE-NEXT: uunpkhi z6.s, z2.h
243
225
; SVE-NEXT: uunpklo z2.s, z2.h
244
-
; SVE-NEXT: uunpklo z6.s, z0.h
245
226
; SVE-NEXT: fneg z1.h, p1/m, z1.h
246
227
; SVE-NEXT: ptrue p1.s
247
228
; SVE-NEXT: lsl z3.s, z3.s, #16
248
229
; SVE-NEXT: lsl z4.s, z4.s, #16
249
-
; SVE-NEXT: lsl z2.s, z2.s, #16
250
-
; SVE-NEXT: lsl z6.s, z6.s, #16
251
230
; SVE-NEXT: uunpkhi z5.s, z1.h
252
231
; SVE-NEXT: uunpklo z1.s, z1.h
253
-
; SVE-NEXT: lsl z5.s, z5.s, #16
254
-
; SVE-NEXT: lsl z1.s, z1.s, #16
255
-
; SVE-NEXT: fmad z3.s, p1/m, z5.s, z4.s
256
-
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s
257
-
; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
258
-
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
259
-
; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
232
+
; SVE-NEXT: bfmlalb z3.s, z5.h, z6.h
233
+
; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h
234
+
; SVE-NEXT: bfcvt z1.h, p1/m, z3.s
235
+
; SVE-NEXT: bfcvt z2.h, p1/m, z4.s
236
+
; SVE-NEXT: uzp1 z1.h, z2.h, z1.h
260
237
; SVE-NEXT: mov z0.h, p0/m, z1.h
261
238
; SVE-NEXT: ret
262
239
;
@@ -274,12 +251,10 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
0 commit comments