Skip to content

Commit 9886945

Browse files
[LLVM][CodeGen][SVE] Use BFMLALB for promoted bfloat fma operations.
NOTE: As far as I can see, LLVM has no support for FEAT_AFP, either in feature detection or in the ACLE builtins, so I believe the compiler can (and does) work under the assumption that the feature is not enabled.
1 parent e0fd9b9 commit 9886945

File tree

3 files changed

+50
-77
lines changed

3 files changed

+50
-77
lines changed

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2578,6 +2578,10 @@ let Predicates = [HasBF16, HasSVE_or_SME] in {
25782578
defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>;
25792579
defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>;
25802580

2581+
def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))),
2582+
(nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))),
2583+
(BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>;
2584+
25812585
defm BFCVT_ZPmZ : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>;
25822586
defm BFCVTNT_ZPmZ : sve_bfloat_convert_top<"bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32_v2>;
25832587
} // End HasBF16, HasSVE_or_SME

llvm/test/CodeGen/AArch64/sve-bf16-arith.ll

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -466,12 +466,10 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x
466466
define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c) {
467467
; NOB16B16-LABEL: fmla_nxv4bf16:
468468
; NOB16B16: // %bb.0:
469-
; NOB16B16-NEXT: lsl z1.s, z1.s, #16
470-
; NOB16B16-NEXT: lsl z0.s, z0.s, #16
471469
; NOB16B16-NEXT: lsl z2.s, z2.s, #16
472470
; NOB16B16-NEXT: ptrue p0.s
473-
; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
474-
; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
471+
; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h
472+
; NOB16B16-NEXT: bfcvt z0.h, p0/m, z2.s
475473
; NOB16B16-NEXT: ret
476474
;
477475
; B16B16-LABEL: fmla_nxv4bf16:
@@ -486,24 +484,20 @@ define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x
486484
define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
487485
; NOB16B16-LABEL: fmla_nxv8bf16:
488486
; NOB16B16: // %bb.0:
489-
; NOB16B16-NEXT: uunpkhi z3.s, z1.h
490-
; NOB16B16-NEXT: uunpkhi z4.s, z0.h
491-
; NOB16B16-NEXT: uunpkhi z5.s, z2.h
487+
; NOB16B16-NEXT: uunpkhi z3.s, z2.h
488+
; NOB16B16-NEXT: uunpklo z2.s, z2.h
489+
; NOB16B16-NEXT: uunpkhi z4.s, z1.h
490+
; NOB16B16-NEXT: uunpkhi z5.s, z0.h
492491
; NOB16B16-NEXT: uunpklo z1.s, z1.h
493492
; NOB16B16-NEXT: uunpklo z0.s, z0.h
494-
; NOB16B16-NEXT: uunpklo z2.s, z2.h
495493
; NOB16B16-NEXT: ptrue p0.s
496494
; NOB16B16-NEXT: lsl z3.s, z3.s, #16
497-
; NOB16B16-NEXT: lsl z4.s, z4.s, #16
498-
; NOB16B16-NEXT: lsl z5.s, z5.s, #16
499-
; NOB16B16-NEXT: lsl z1.s, z1.s, #16
500-
; NOB16B16-NEXT: lsl z0.s, z0.s, #16
501495
; NOB16B16-NEXT: lsl z2.s, z2.s, #16
502-
; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s
503-
; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
504-
; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s
505-
; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
506-
; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
496+
; NOB16B16-NEXT: bfmlalb z3.s, z5.h, z4.h
497+
; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h
498+
; NOB16B16-NEXT: bfcvt z0.h, p0/m, z3.s
499+
; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
500+
; NOB16B16-NEXT: uzp1 z0.h, z1.h, z0.h
507501
; NOB16B16-NEXT: ret
508502
;
509503
; B16B16-LABEL: fmla_nxv8bf16:

llvm/test/CodeGen/AArch64/sve-bf16-combines.ll

Lines changed: 35 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,17 @@ target triple = "aarch64-unknown-linux-gnu"
77
define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
88
; SVE-LABEL: fmla_nxv8bf16:
99
; SVE: // %bb.0:
10-
; SVE-NEXT: uunpkhi z3.s, z2.h
11-
; SVE-NEXT: uunpkhi z4.s, z1.h
12-
; SVE-NEXT: uunpkhi z5.s, z0.h
10+
; SVE-NEXT: uunpkhi z3.s, z0.h
11+
; SVE-NEXT: uunpklo z0.s, z0.h
12+
; SVE-NEXT: uunpkhi z4.s, z2.h
13+
; SVE-NEXT: uunpkhi z5.s, z1.h
1314
; SVE-NEXT: uunpklo z2.s, z2.h
1415
; SVE-NEXT: uunpklo z1.s, z1.h
15-
; SVE-NEXT: uunpklo z0.s, z0.h
1616
; SVE-NEXT: ptrue p0.s
1717
; SVE-NEXT: lsl z3.s, z3.s, #16
18-
; SVE-NEXT: lsl z4.s, z4.s, #16
19-
; SVE-NEXT: lsl z5.s, z5.s, #16
20-
; SVE-NEXT: lsl z2.s, z2.s, #16
21-
; SVE-NEXT: lsl z1.s, z1.s, #16
2218
; SVE-NEXT: lsl z0.s, z0.s, #16
23-
; SVE-NEXT: fmad z3.s, p0/m, z4.s, z5.s
24-
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
19+
; SVE-NEXT: bfmlalb z3.s, z5.h, z4.h
20+
; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
2521
; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
2622
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
2723
; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
@@ -40,11 +36,9 @@ define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
4036
define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
4137
; SVE-LABEL: fmla_nxv4bf16:
4238
; SVE: // %bb.0:
43-
; SVE-NEXT: lsl z2.s, z2.s, #16
44-
; SVE-NEXT: lsl z1.s, z1.s, #16
4539
; SVE-NEXT: lsl z0.s, z0.s, #16
4640
; SVE-NEXT: ptrue p0.s
47-
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
41+
; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
4842
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
4943
; SVE-NEXT: ret
5044
;
@@ -83,22 +77,18 @@ define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale
8377
; SVE-LABEL: fmls_nxv8bf16:
8478
; SVE: // %bb.0:
8579
; SVE-NEXT: ptrue p0.h
86-
; SVE-NEXT: uunpkhi z3.s, z2.h
87-
; SVE-NEXT: uunpkhi z4.s, z0.h
88-
; SVE-NEXT: uunpklo z2.s, z2.h
80+
; SVE-NEXT: uunpkhi z3.s, z0.h
8981
; SVE-NEXT: uunpklo z0.s, z0.h
82+
; SVE-NEXT: uunpkhi z5.s, z2.h
83+
; SVE-NEXT: uunpklo z2.s, z2.h
9084
; SVE-NEXT: fneg z1.h, p0/m, z1.h
9185
; SVE-NEXT: ptrue p0.s
9286
; SVE-NEXT: lsl z3.s, z3.s, #16
93-
; SVE-NEXT: lsl z4.s, z4.s, #16
94-
; SVE-NEXT: lsl z2.s, z2.s, #16
9587
; SVE-NEXT: lsl z0.s, z0.s, #16
96-
; SVE-NEXT: uunpkhi z5.s, z1.h
88+
; SVE-NEXT: uunpkhi z4.s, z1.h
9789
; SVE-NEXT: uunpklo z1.s, z1.h
98-
; SVE-NEXT: lsl z5.s, z5.s, #16
99-
; SVE-NEXT: lsl z1.s, z1.s, #16
100-
; SVE-NEXT: fmad z3.s, p0/m, z5.s, z4.s
101-
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
90+
; SVE-NEXT: bfmlalb z3.s, z4.h, z5.h
91+
; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
10292
; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
10393
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
10494
; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
@@ -118,11 +108,9 @@ define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale
118108
; SVE-LABEL: fmls_nxv4bf16:
119109
; SVE: // %bb.0:
120110
; SVE-NEXT: ptrue p0.s
121-
; SVE-NEXT: lsl z2.s, z2.s, #16
122111
; SVE-NEXT: lsl z0.s, z0.s, #16
123112
; SVE-NEXT: fneg z1.h, p0/m, z1.h
124-
; SVE-NEXT: lsl z1.s, z1.s, #16
125-
; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
113+
; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
126114
; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
127115
; SVE-NEXT: ret
128116
;
@@ -161,24 +149,20 @@ define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale
161149
define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
162150
; SVE-LABEL: fmla_sel_nxv8bf16:
163151
; SVE: // %bb.0:
164-
; SVE-NEXT: uunpkhi z3.s, z2.h
165-
; SVE-NEXT: uunpkhi z4.s, z1.h
166-
; SVE-NEXT: uunpkhi z5.s, z0.h
152+
; SVE-NEXT: uunpkhi z3.s, z0.h
153+
; SVE-NEXT: uunpklo z4.s, z0.h
154+
; SVE-NEXT: uunpkhi z5.s, z2.h
155+
; SVE-NEXT: uunpkhi z6.s, z1.h
167156
; SVE-NEXT: uunpklo z2.s, z2.h
168157
; SVE-NEXT: uunpklo z1.s, z1.h
169-
; SVE-NEXT: uunpklo z6.s, z0.h
170158
; SVE-NEXT: ptrue p1.s
171159
; SVE-NEXT: lsl z3.s, z3.s, #16
172160
; SVE-NEXT: lsl z4.s, z4.s, #16
173-
; SVE-NEXT: lsl z5.s, z5.s, #16
174-
; SVE-NEXT: lsl z2.s, z2.s, #16
175-
; SVE-NEXT: lsl z1.s, z1.s, #16
176-
; SVE-NEXT: lsl z6.s, z6.s, #16
177-
; SVE-NEXT: fmad z3.s, p1/m, z4.s, z5.s
178-
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s
179-
; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
180-
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
181-
; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
161+
; SVE-NEXT: bfmlalb z3.s, z6.h, z5.h
162+
; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h
163+
; SVE-NEXT: bfcvt z1.h, p1/m, z3.s
164+
; SVE-NEXT: bfcvt z2.h, p1/m, z4.s
165+
; SVE-NEXT: uzp1 z1.h, z2.h, z1.h
182166
; SVE-NEXT: mov z0.h, p0/m, z1.h
183167
; SVE-NEXT: ret
184168
;
@@ -195,12 +179,9 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
195179
define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
196180
; SVE-LABEL: fmla_sel_nxv4bf16:
197181
; SVE: // %bb.0:
198-
; SVE-NEXT: lsl z2.s, z2.s, #16
199-
; SVE-NEXT: lsl z1.s, z1.s, #16
200182
; SVE-NEXT: lsl z3.s, z0.s, #16
201-
; SVE-NEXT: ptrue p1.s
202-
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
203-
; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
183+
; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h
184+
; SVE-NEXT: bfcvt z0.h, p0/m, z3.s
204185
; SVE-NEXT: ret
205186
;
206187
; SVE-B16B16-LABEL: fmla_sel_nxv4bf16:
@@ -238,25 +219,21 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
238219
; SVE-LABEL: fmls_sel_nxv8bf16:
239220
; SVE: // %bb.0:
240221
; SVE-NEXT: ptrue p1.h
241-
; SVE-NEXT: uunpkhi z3.s, z2.h
242-
; SVE-NEXT: uunpkhi z4.s, z0.h
222+
; SVE-NEXT: uunpkhi z3.s, z0.h
223+
; SVE-NEXT: uunpklo z4.s, z0.h
224+
; SVE-NEXT: uunpkhi z6.s, z2.h
243225
; SVE-NEXT: uunpklo z2.s, z2.h
244-
; SVE-NEXT: uunpklo z6.s, z0.h
245226
; SVE-NEXT: fneg z1.h, p1/m, z1.h
246227
; SVE-NEXT: ptrue p1.s
247228
; SVE-NEXT: lsl z3.s, z3.s, #16
248229
; SVE-NEXT: lsl z4.s, z4.s, #16
249-
; SVE-NEXT: lsl z2.s, z2.s, #16
250-
; SVE-NEXT: lsl z6.s, z6.s, #16
251230
; SVE-NEXT: uunpkhi z5.s, z1.h
252231
; SVE-NEXT: uunpklo z1.s, z1.h
253-
; SVE-NEXT: lsl z5.s, z5.s, #16
254-
; SVE-NEXT: lsl z1.s, z1.s, #16
255-
; SVE-NEXT: fmad z3.s, p1/m, z5.s, z4.s
256-
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s
257-
; SVE-NEXT: bfcvt z2.h, p1/m, z3.s
258-
; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
259-
; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
232+
; SVE-NEXT: bfmlalb z3.s, z5.h, z6.h
233+
; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h
234+
; SVE-NEXT: bfcvt z1.h, p1/m, z3.s
235+
; SVE-NEXT: bfcvt z2.h, p1/m, z4.s
236+
; SVE-NEXT: uzp1 z1.h, z2.h, z1.h
260237
; SVE-NEXT: mov z0.h, p0/m, z1.h
261238
; SVE-NEXT: ret
262239
;
@@ -274,12 +251,10 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
274251
; SVE-LABEL: fmls_sel_nxv4bf16:
275252
; SVE: // %bb.0:
276253
; SVE-NEXT: ptrue p1.s
277-
; SVE-NEXT: lsl z2.s, z2.s, #16
278254
; SVE-NEXT: lsl z3.s, z0.s, #16
279255
; SVE-NEXT: fneg z1.h, p1/m, z1.h
280-
; SVE-NEXT: lsl z1.s, z1.s, #16
281-
; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
282-
; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
256+
; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h
257+
; SVE-NEXT: bfcvt z0.h, p0/m, z3.s
283258
; SVE-NEXT: ret
284259
;
285260
; SVE-B16B16-LABEL: fmls_sel_nxv4bf16:

0 commit comments

Comments
 (0)