Skip to content

Commit e0fd9b9

Browse files
[LLVM][CodeGen][SVE] Enable BFloat fma contraction more aggressively.
1 parent d0e82f2 commit e0fd9b9

File tree

2 files changed

+55 -95 lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18570,7 +18570,7 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
   case MVT::f64:
     return true;
   case MVT::bf16:
-    return VT.isScalableVector() && Subtarget->hasSVEB16B16() &&
+    return VT.isScalableVector() && Subtarget->hasBF16() &&
            Subtarget->isNonStreamingSVEorSME2Available();
   default:
     break;

llvm/test/CodeGen/AArch64/sve-bf16-combines.ll

Lines changed: 54 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -9,26 +9,20 @@ define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
99
; SVE: // %bb.0:
1010
; SVE-NEXT: uunpkhi z3.s, z2.h
1111
; SVE-NEXT: uunpkhi z4.s, z1.h
12+
; SVE-NEXT: uunpkhi z5.s, z0.h
1213
; SVE-NEXT: uunpklo z2.s, z2.h
1314
; SVE-NEXT: uunpklo z1.s, z1.h
15+
; SVE-NEXT: uunpklo z0.s, z0.h
1416
; SVE-NEXT: ptrue p0.s
1517
; SVE-NEXT: lsl z3.s, z3.s, #16
1618
; SVE-NEXT: lsl z4.s, z4.s, #16
19+
; SVE-NEXT: lsl z5.s, z5.s, #16
1720
; SVE-NEXT: lsl z2.s, z2.s, #16
1821
; SVE-NEXT: lsl z1.s, z1.s, #16
19-
; SVE-NEXT: fmul z3.s, z4.s, z3.s
20-
; SVE-NEXT: fmul z1.s, z1.s, z2.s
21-
; SVE-NEXT: bfcvt z2.h, p0/m, z3.s
22-
; SVE-NEXT: uunpkhi z3.s, z0.h
23-
; SVE-NEXT: uunpklo z0.s, z0.h
24-
; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
25-
; SVE-NEXT: lsl z2.s, z2.s, #16
26-
; SVE-NEXT: lsl z3.s, z3.s, #16
2722
; SVE-NEXT: lsl z0.s, z0.s, #16
28-
; SVE-NEXT: lsl z1.s, z1.s, #16
29-
; SVE-NEXT: fadd z2.s, z3.s, z2.s
30-
; SVE-NEXT: fadd z0.s, z0.s, z1.s
31-
; SVE-NEXT: bfcvt z1.h, p0/m, z2.s
23+
; SVE-NEXT: fmad z3.s, p0/m, z4.s, z5.s
24+
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
25+
; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
3226
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
3327
; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
3428
; SVE-NEXT: ret
@@ -48,12 +42,9 @@ define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale
4842
; SVE: // %bb.0:
4943
; SVE-NEXT: lsl z2.s, z2.s, #16
5044
; SVE-NEXT: lsl z1.s, z1.s, #16
51-
; SVE-NEXT: ptrue p0.s
5245
; SVE-NEXT: lsl z0.s, z0.s, #16
53-
; SVE-NEXT: fmul z1.s, z1.s, z2.s
54-
; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
55-
; SVE-NEXT: lsl z1.s, z1.s, #16
56-
; SVE-NEXT: fadd z0.s, z0.s, z1.s
46+
; SVE-NEXT: ptrue p0.s
47+
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
5748
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
5849
; SVE-NEXT: ret
5950
;
@@ -72,12 +63,9 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale
7263
; SVE: // %bb.0:
7364
; SVE-NEXT: lsl z2.s, z2.s, #16
7465
; SVE-NEXT: lsl z1.s, z1.s, #16
75-
; SVE-NEXT: ptrue p0.d
7666
; SVE-NEXT: lsl z0.s, z0.s, #16
77-
; SVE-NEXT: fmul z1.s, p0/m, z1.s, z2.s
78-
; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
79-
; SVE-NEXT: lsl z1.s, z1.s, #16
80-
; SVE-NEXT: fadd z0.s, p0/m, z0.s, z1.s
67+
; SVE-NEXT: ptrue p0.d
68+
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
8169
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
8270
; SVE-NEXT: ret
8371
;
@@ -94,28 +82,24 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale
9482
define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
9583
; SVE-LABEL: fmls_nxv8bf16:
9684
; SVE: // %bb.0:
85+
; SVE-NEXT: ptrue p0.h
9786
; SVE-NEXT: uunpkhi z3.s, z2.h
98-
; SVE-NEXT: uunpkhi z4.s, z1.h
87+
; SVE-NEXT: uunpkhi z4.s, z0.h
9988
; SVE-NEXT: uunpklo z2.s, z2.h
100-
; SVE-NEXT: uunpklo z1.s, z1.h
89+
; SVE-NEXT: uunpklo z0.s, z0.h
90+
; SVE-NEXT: fneg z1.h, p0/m, z1.h
10191
; SVE-NEXT: ptrue p0.s
10292
; SVE-NEXT: lsl z3.s, z3.s, #16
10393
; SVE-NEXT: lsl z4.s, z4.s, #16
10494
; SVE-NEXT: lsl z2.s, z2.s, #16
105-
; SVE-NEXT: lsl z1.s, z1.s, #16
106-
; SVE-NEXT: fmul z3.s, z4.s, z3.s
107-
; SVE-NEXT: fmul z1.s, z1.s, z2.s
108-
; SVE-NEXT: bfcvt z2.h, p0/m, z3.s
109-
; SVE-NEXT: uunpkhi z3.s, z0.h
110-
; SVE-NEXT: uunpklo z0.s, z0.h
111-
; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
112-
; SVE-NEXT: lsl z2.s, z2.s, #16
113-
; SVE-NEXT: lsl z3.s, z3.s, #16
11495
; SVE-NEXT: lsl z0.s, z0.s, #16
96+
; SVE-NEXT: uunpkhi z5.s, z1.h
97+
; SVE-NEXT: uunpklo z1.s, z1.h
98+
; SVE-NEXT: lsl z5.s, z5.s, #16
11599
; SVE-NEXT: lsl z1.s, z1.s, #16
116-
; SVE-NEXT: fsub z2.s, z3.s, z2.s
117-
; SVE-NEXT: fsub z0.s, z0.s, z1.s
118-
; SVE-NEXT: bfcvt z1.h, p0/m, z2.s
100+
; SVE-NEXT: fmad z3.s, p0/m, z5.s, z4.s
101+
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
102+
; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
119103
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
120104
; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
121105
; SVE-NEXT: ret
@@ -133,14 +117,12 @@ define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
133117
define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
134118
; SVE-LABEL: fmls_nxv4bf16:
135119
; SVE: // %bb.0:
136-
; SVE-NEXT: lsl z2.s, z2.s, #16
137-
; SVE-NEXT: lsl z1.s, z1.s, #16
138120
; SVE-NEXT: ptrue p0.s
121+
; SVE-NEXT: lsl z2.s, z2.s, #16
139122
; SVE-NEXT: lsl z0.s, z0.s, #16
140-
; SVE-NEXT: fmul z1.s, z1.s, z2.s
141-
; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
123+
; SVE-NEXT: fneg z1.h, p0/m, z1.h
142124
; SVE-NEXT: lsl z1.s, z1.s, #16
143-
; SVE-NEXT: fsub z0.s, z0.s, z1.s
125+
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
144126
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
145127
; SVE-NEXT: ret
146128
;
@@ -157,14 +139,12 @@ define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale
157139
define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
158140
; SVE-LABEL: fmls_nxv2bf16:
159141
; SVE: // %bb.0:
160-
; SVE-NEXT: lsl z2.s, z2.s, #16
161-
; SVE-NEXT: lsl z1.s, z1.s, #16
162142
; SVE-NEXT: ptrue p0.d
143+
; SVE-NEXT: lsl z2.s, z2.s, #16
163144
; SVE-NEXT: lsl z0.s, z0.s, #16
164-
; SVE-NEXT: fmul z1.s, p0/m, z1.s, z2.s
165-
; SVE-NEXT: bfcvt z1.h, p0/m, z1.s
145+
; SVE-NEXT: fneg z1.h, p0/m, z1.h
166146
; SVE-NEXT: lsl z1.s, z1.s, #16
167-
; SVE-NEXT: fsub z0.s, p0/m, z0.s, z1.s
147+
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
168148
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
169149
; SVE-NEXT: ret
170150
;
@@ -183,26 +163,20 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
183163
; SVE: // %bb.0:
184164
; SVE-NEXT: uunpkhi z3.s, z2.h
185165
; SVE-NEXT: uunpkhi z4.s, z1.h
166+
; SVE-NEXT: uunpkhi z5.s, z0.h
186167
; SVE-NEXT: uunpklo z2.s, z2.h
187168
; SVE-NEXT: uunpklo z1.s, z1.h
169+
; SVE-NEXT: uunpklo z6.s, z0.h
188170
; SVE-NEXT: ptrue p1.s
189171
; SVE-NEXT: lsl z3.s, z3.s, #16
190172
; SVE-NEXT: lsl z4.s, z4.s, #16
173+
; SVE-NEXT: lsl z5.s, z5.s, #16
191174
; SVE-NEXT: lsl z2.s, z2.s, #16
192175
; SVE-NEXT: lsl z1.s, z1.s, #16
193-
; SVE-NEXT: fmul z3.s, z4.s, z3.s
194-
; SVE-NEXT: uunpklo z4.s, z0.h
195-
; SVE-NEXT: fmul z1.s, z1.s, z2.s
176+
; SVE-NEXT: lsl z6.s, z6.s, #16
177+
; SVE-NEXT: fmad z3.s, p1/m, z4.s, z5.s
178+
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s
196179
; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
197-
; SVE-NEXT: uunpkhi z3.s, z0.h
198-
; SVE-NEXT: lsl z4.s, z4.s, #16
199-
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
200-
; SVE-NEXT: lsl z2.s, z2.s, #16
201-
; SVE-NEXT: lsl z3.s, z3.s, #16
202-
; SVE-NEXT: lsl z1.s, z1.s, #16
203-
; SVE-NEXT: fadd z2.s, z3.s, z2.s
204-
; SVE-NEXT: fadd z1.s, z4.s, z1.s
205-
; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
206180
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
207181
; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
208182
; SVE-NEXT: mov z0.h, p0/m, z1.h
@@ -223,12 +197,9 @@ define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
223197
; SVE: // %bb.0:
224198
; SVE-NEXT: lsl z2.s, z2.s, #16
225199
; SVE-NEXT: lsl z1.s, z1.s, #16
200+
; SVE-NEXT: lsl z3.s, z0.s, #16
226201
; SVE-NEXT: ptrue p1.s
227-
; SVE-NEXT: fmul z1.s, z1.s, z2.s
228-
; SVE-NEXT: lsl z2.s, z0.s, #16
229-
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
230-
; SVE-NEXT: lsl z1.s, z1.s, #16
231-
; SVE-NEXT: fadd z1.s, z2.s, z1.s
202+
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
232203
; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
233204
; SVE-NEXT: ret
234205
;
@@ -247,12 +218,9 @@ define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
247218
; SVE: // %bb.0:
248219
; SVE-NEXT: lsl z2.s, z2.s, #16
249220
; SVE-NEXT: lsl z1.s, z1.s, #16
221+
; SVE-NEXT: lsl z3.s, z0.s, #16
250222
; SVE-NEXT: ptrue p1.d
251-
; SVE-NEXT: fmul z1.s, p1/m, z1.s, z2.s
252-
; SVE-NEXT: lsl z2.s, z0.s, #16
253-
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
254-
; SVE-NEXT: lsl z1.s, z1.s, #16
255-
; SVE-NEXT: fadd z1.s, p1/m, z1.s, z2.s
223+
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
256224
; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
257225
; SVE-NEXT: ret
258226
;
@@ -269,28 +237,24 @@ define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
269237
define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
270238
; SVE-LABEL: fmls_sel_nxv8bf16:
271239
; SVE: // %bb.0:
240+
; SVE-NEXT: ptrue p1.h
272241
; SVE-NEXT: uunpkhi z3.s, z2.h
273-
; SVE-NEXT: uunpkhi z4.s, z1.h
242+
; SVE-NEXT: uunpkhi z4.s, z0.h
274243
; SVE-NEXT: uunpklo z2.s, z2.h
275-
; SVE-NEXT: uunpklo z1.s, z1.h
244+
; SVE-NEXT: uunpklo z6.s, z0.h
245+
; SVE-NEXT: fneg z1.h, p1/m, z1.h
276246
; SVE-NEXT: ptrue p1.s
277247
; SVE-NEXT: lsl z3.s, z3.s, #16
278248
; SVE-NEXT: lsl z4.s, z4.s, #16
279249
; SVE-NEXT: lsl z2.s, z2.s, #16
250+
; SVE-NEXT: lsl z6.s, z6.s, #16
251+
; SVE-NEXT: uunpkhi z5.s, z1.h
252+
; SVE-NEXT: uunpklo z1.s, z1.h
253+
; SVE-NEXT: lsl z5.s, z5.s, #16
280254
; SVE-NEXT: lsl z1.s, z1.s, #16
281-
; SVE-NEXT: fmul z3.s, z4.s, z3.s
282-
; SVE-NEXT: uunpklo z4.s, z0.h
283-
; SVE-NEXT: fmul z1.s, z1.s, z2.s
255+
; SVE-NEXT: fmad z3.s, p1/m, z5.s, z4.s
256+
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s
284257
; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
285-
; SVE-NEXT: uunpkhi z3.s, z0.h
286-
; SVE-NEXT: lsl z4.s, z4.s, #16
287-
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
288-
; SVE-NEXT: lsl z2.s, z2.s, #16
289-
; SVE-NEXT: lsl z3.s, z3.s, #16
290-
; SVE-NEXT: lsl z1.s, z1.s, #16
291-
; SVE-NEXT: fsub z2.s, z3.s, z2.s
292-
; SVE-NEXT: fsub z1.s, z4.s, z1.s
293-
; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
294258
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
295259
; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
296260
; SVE-NEXT: mov z0.h, p0/m, z1.h
@@ -309,14 +273,12 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
309273
define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
310274
; SVE-LABEL: fmls_sel_nxv4bf16:
311275
; SVE: // %bb.0:
312-
; SVE-NEXT: lsl z2.s, z2.s, #16
313-
; SVE-NEXT: lsl z1.s, z1.s, #16
314276
; SVE-NEXT: ptrue p1.s
315-
; SVE-NEXT: fmul z1.s, z1.s, z2.s
316-
; SVE-NEXT: lsl z2.s, z0.s, #16
317-
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
277+
; SVE-NEXT: lsl z2.s, z2.s, #16
278+
; SVE-NEXT: lsl z3.s, z0.s, #16
279+
; SVE-NEXT: fneg z1.h, p1/m, z1.h
318280
; SVE-NEXT: lsl z1.s, z1.s, #16
319-
; SVE-NEXT: fsub z1.s, z2.s, z1.s
281+
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
320282
; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
321283
; SVE-NEXT: ret
322284
;
@@ -333,14 +295,12 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
333295
define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
334296
; SVE-LABEL: fmls_sel_nxv2bf16:
335297
; SVE: // %bb.0:
336-
; SVE-NEXT: lsl z2.s, z2.s, #16
337-
; SVE-NEXT: lsl z1.s, z1.s, #16
338298
; SVE-NEXT: ptrue p1.d
339-
; SVE-NEXT: fmul z1.s, p1/m, z1.s, z2.s
340-
; SVE-NEXT: lsl z2.s, z0.s, #16
341-
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
299+
; SVE-NEXT: lsl z2.s, z2.s, #16
300+
; SVE-NEXT: lsl z3.s, z0.s, #16
301+
; SVE-NEXT: fneg z1.h, p1/m, z1.h
342302
; SVE-NEXT: lsl z1.s, z1.s, #16
343-
; SVE-NEXT: fsubr z1.s, p1/m, z1.s, z2.s
303+
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
344304
; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
345305
; SVE-NEXT: ret
346306
;

0 commit comments

Comments
 (0)