Skip to content

Commit 46e9d63

Browse files
[LLVM][CodeGen][SVE] Use BFMLALB for promoted bfloat fma operations. (#167340)
We're likely to get better code from custom legalisation, where we can remove unpack instructions (plus SVE2p1 has BFMLSLB/T), but we get much of the benefit with these two small changes. NOTE: LLVM has no support for FEAT_AFP in terms of feature detection or ACLE builtins, so the compiler works under the assumption the feature is not enabled. The patch is also more aggressive when enabling bfloat fma construction because it removes unnecessary rounding, which is generally preferable regardless of whether BFMLALB is used or not.
1 parent 02c68b3 commit 46e9d63

File tree

4 files changed

+616
-145
lines changed

4 files changed

+616
-145
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18555,7 +18555,7 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
1855518555
case MVT::f64:
1855618556
return true;
1855718557
case MVT::bf16:
18558-
return VT.isScalableVector() && Subtarget->hasSVEB16B16() &&
18558+
return VT.isScalableVector() && Subtarget->hasBF16() &&
1855918559
Subtarget->isNonStreamingSVEorSME2Available();
1856018560
default:
1856118561
break;

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2583,6 +2583,11 @@ let Predicates = [HasBF16, HasSVE_or_SME] in {
25832583
defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>;
25842584
defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>;
25852585

2586+
def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc,
2587+
(nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))),
2588+
(nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))),
2589+
(BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>;
2590+
25862591
defm BFCVT_ZPmZ : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>;
25872592
defm BFCVTNT_ZPmZ : sve_bfloat_convert_top<"bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32_v2>;
25882593
} // End HasBF16, HasSVE_or_SME

llvm/test/CodeGen/AArch64/sve-bf16-arith.ll

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -466,12 +466,10 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x
466466
define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c) {
467467
; NOB16B16-LABEL: fmla_nxv4bf16:
468468
; NOB16B16: // %bb.0:
469-
; NOB16B16-NEXT: lsl z1.s, z1.s, #16
470-
; NOB16B16-NEXT: lsl z0.s, z0.s, #16
471469
; NOB16B16-NEXT: lsl z2.s, z2.s, #16
472470
; NOB16B16-NEXT: ptrue p0.s
473-
; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
474-
; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
471+
; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h
472+
; NOB16B16-NEXT: bfcvt z0.h, p0/m, z2.s
475473
; NOB16B16-NEXT: ret
476474
;
477475
; B16B16-LABEL: fmla_nxv4bf16:
@@ -486,24 +484,20 @@ define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x
486484
define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
487485
; NOB16B16-LABEL: fmla_nxv8bf16:
488486
; NOB16B16: // %bb.0:
489-
; NOB16B16-NEXT: uunpkhi z3.s, z1.h
490-
; NOB16B16-NEXT: uunpkhi z4.s, z0.h
491-
; NOB16B16-NEXT: uunpkhi z5.s, z2.h
487+
; NOB16B16-NEXT: uunpkhi z3.s, z2.h
488+
; NOB16B16-NEXT: uunpklo z2.s, z2.h
489+
; NOB16B16-NEXT: uunpkhi z4.s, z1.h
490+
; NOB16B16-NEXT: uunpkhi z5.s, z0.h
492491
; NOB16B16-NEXT: uunpklo z1.s, z1.h
493492
; NOB16B16-NEXT: uunpklo z0.s, z0.h
494-
; NOB16B16-NEXT: uunpklo z2.s, z2.h
495493
; NOB16B16-NEXT: ptrue p0.s
496494
; NOB16B16-NEXT: lsl z3.s, z3.s, #16
497-
; NOB16B16-NEXT: lsl z4.s, z4.s, #16
498-
; NOB16B16-NEXT: lsl z5.s, z5.s, #16
499-
; NOB16B16-NEXT: lsl z1.s, z1.s, #16
500-
; NOB16B16-NEXT: lsl z0.s, z0.s, #16
501495
; NOB16B16-NEXT: lsl z2.s, z2.s, #16
502-
; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s
503-
; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
504-
; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s
505-
; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
506-
; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
496+
; NOB16B16-NEXT: bfmlalb z3.s, z5.h, z4.h
497+
; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h
498+
; NOB16B16-NEXT: bfcvt z0.h, p0/m, z3.s
499+
; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
500+
; NOB16B16-NEXT: uzp1 z0.h, z1.h, z0.h
507501
; NOB16B16-NEXT: ret
508502
;
509503
; B16B16-LABEL: fmla_nxv8bf16:

0 commit comments

Comments
 (0)