Skip to content

Commit 6a9971d

Browse files
committed
[AMDGPU] Enable CodeGen for v_pk_fma_bf16
1 parent f44daa1 commit 6a9971d

File tree

3 files changed

+392
-791
lines changed

3 files changed

+392
-791
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6106,6 +6106,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
61066106
case MVT::f64:
61076107
return true;
61086108
case MVT::f16:
6109+
case MVT::bf16:
61096110
return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
61106111
default:
61116112
break;

llvm/test/CodeGen/AMDGPU/bf16-math.ll

Lines changed: 29 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -370,6 +370,9 @@ define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
370370
; GCN: ; %bb.0:
371371
; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp
372372
; GCN-NEXT: ; return to shader part epilog
373+
374+
375+
373376
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
374377
%max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
375378
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
@@ -381,6 +384,9 @@ define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloa
381384
; GCN: ; %bb.0:
382385
; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp
383386
; GCN-NEXT: ; return to shader part epilog
387+
388+
389+
384390
%mul = fmul <2 x bfloat> %src0, %src1
385391
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
386392
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
@@ -391,11 +397,12 @@ define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloa
391397
define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
392398
; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
393399
; GCN: ; %bb.0:
394-
; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
395-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
396-
; GCN-NEXT: v_pk_add_bf16 v2, v2, v4
400+
; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4
397401
; GCN-NEXT: global_store_b32 v[0:1], v2, off
398402
; GCN-NEXT: s_endpgm
403+
404+
405+
399406
%mul = fmul contract <2 x bfloat> %a, %b
400407
%add = fadd contract <2 x bfloat> %mul, %c
401408
store <2 x bfloat> %add, ptr addrspace(1) %out
@@ -405,11 +412,12 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfl
405412
define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
406413
; GCN-LABEL: v_test_mul_add_v2bf16_vss:
407414
; GCN: ; %bb.0:
408-
; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
409-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
410-
; GCN-NEXT: v_pk_add_bf16 v2, v2, s1
415+
; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1
411416
; GCN-NEXT: global_store_b32 v[0:1], v2, off
412417
; GCN-NEXT: s_endpgm
418+
419+
420+
413421
%mul = fmul contract <2 x bfloat> %a, %b
414422
%add = fadd contract <2 x bfloat> %mul, %c
415423
store <2 x bfloat> %add, ptr addrspace(1) %out
@@ -419,11 +427,14 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfl
419427
define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
420428
; GCN-LABEL: v_test_mul_add_v2bf16_sss:
421429
; GCN: ; %bb.0:
422-
; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
430+
; GCN-NEXT: v_mov_b32_e32 v2, s2
423431
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
424-
; GCN-NEXT: v_pk_add_bf16 v2, v2, s2
432+
; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2
425433
; GCN-NEXT: global_store_b32 v[0:1], v2, off
426434
; GCN-NEXT: s_endpgm
435+
436+
437+
427438
%mul = fmul contract <2 x bfloat> %a, %b
428439
%add = fadd contract <2 x bfloat> %mul, %c
429440
store <2 x bfloat> %add, ptr addrspace(1) %out
@@ -433,11 +444,12 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfl
433444
define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
434445
; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
435446
; GCN: ; %bb.0:
436-
; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
437-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
438-
; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
447+
; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
439448
; GCN-NEXT: global_store_b32 v[0:1], v2, off
440449
; GCN-NEXT: s_endpgm
450+
451+
452+
441453
%mul = fmul contract <2 x bfloat> %a, %b
442454
%add = fadd contract <2 x bfloat> %mul, <bfloat 0.5, bfloat 0.5>
443455
store <2 x bfloat> %add, ptr addrspace(1) %out
@@ -447,11 +459,14 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfl
447459
define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
448460
; GCN-LABEL: v_test_mul_add_v2bf16_vll:
449461
; GCN: ; %bb.0:
450-
; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
451-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
452-
; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000, v2
462+
; GCN-NEXT: s_mov_b32 s0, 0x43484000
463+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
464+
; GCN-NEXT: v_pk_fma_bf16 v2, 0x42c83f80, v2, s0
453465
; GCN-NEXT: global_store_b32 v[0:1], v2, off
454466
; GCN-NEXT: s_endpgm
467+
468+
469+
455470
%mul = fmul contract <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
456471
%add = fadd contract <2 x bfloat> %mul, <bfloat 2.0, bfloat 200.0>
457472
store <2 x bfloat> %add, ptr addrspace(1) %out

0 commit comments

Comments
 (0)