@@ -370,6 +370,9 @@ define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
370370; GCN: ; %bb.0:
371371; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp
372372; GCN-NEXT: ; return to shader part epilog
373+
374+
375+
373376 %exp = call bfloat @llvm.exp2.bf16 (bfloat %src )
374377 %max = call bfloat @llvm.maxnum.bf16 (bfloat %exp , bfloat 0 .0 )
375378 %clamp = call bfloat @llvm.minnum.bf16 (bfloat %max , bfloat 1 .0 )
@@ -381,6 +384,9 @@ define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloa
381384; GCN: ; %bb.0:
382385; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp
383386; GCN-NEXT: ; return to shader part epilog
387+
388+
389+
384390 %mul = fmul <2 x bfloat> %src0 , %src1
385391 %max = call <2 x bfloat> @llvm.maxnum.v2bf16 (<2 x bfloat> %mul , <2 x bfloat> <bfloat 0 .0 , bfloat 0 .0 >)
386392 %clamp = call <2 x bfloat> @llvm.minnum.v2bf16 (<2 x bfloat> %max , <2 x bfloat> <bfloat 1 .0 , bfloat 1 .0 >)
@@ -391,11 +397,12 @@ define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloa
391397define amdgpu_ps void @v_test_mul_add_v2bf16_vvv (ptr addrspace (1 ) %out , <2 x bfloat> %a , <2 x bfloat> %b , <2 x bfloat> %c ) {
392398; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
393399; GCN: ; %bb.0:
394- ; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
395- ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
396- ; GCN-NEXT: v_pk_add_bf16 v2, v2, v4
400+ ; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4
397401; GCN-NEXT: global_store_b32 v[0:1], v2, off
398402; GCN-NEXT: s_endpgm
403+
404+
405+
399406 %mul = fmul contract <2 x bfloat> %a , %b
400407 %add = fadd contract <2 x bfloat> %mul , %c
401408 store <2 x bfloat> %add , ptr addrspace (1 ) %out
@@ -405,11 +412,12 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfl
405412define amdgpu_ps void @v_test_mul_add_v2bf16_vss (ptr addrspace (1 ) %out , <2 x bfloat> %a , <2 x bfloat> inreg %b , <2 x bfloat> inreg %c ) {
406413; GCN-LABEL: v_test_mul_add_v2bf16_vss:
407414; GCN: ; %bb.0:
408- ; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
409- ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
410- ; GCN-NEXT: v_pk_add_bf16 v2, v2, s1
415+ ; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1
411416; GCN-NEXT: global_store_b32 v[0:1], v2, off
412417; GCN-NEXT: s_endpgm
418+
419+
420+
413421 %mul = fmul contract <2 x bfloat> %a , %b
414422 %add = fadd contract <2 x bfloat> %mul , %c
415423 store <2 x bfloat> %add , ptr addrspace (1 ) %out
@@ -419,11 +427,14 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfl
419427define amdgpu_ps void @v_test_mul_add_v2bf16_sss (ptr addrspace (1 ) %out , <2 x bfloat> inreg %a , <2 x bfloat> inreg %b , <2 x bfloat> inreg %c ) {
420428; GCN-LABEL: v_test_mul_add_v2bf16_sss:
421429; GCN: ; %bb.0:
422- ; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
430+ ; GCN-NEXT: v_mov_b32_e32 v2, s2
423431; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
424- ; GCN-NEXT: v_pk_add_bf16 v2, v2, s2
432+ ; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2
425433; GCN-NEXT: global_store_b32 v[0:1], v2, off
426434; GCN-NEXT: s_endpgm
435+
436+
437+
427438 %mul = fmul contract <2 x bfloat> %a , %b
428439 %add = fadd contract <2 x bfloat> %mul , %c
429440 store <2 x bfloat> %add , ptr addrspace (1 ) %out
@@ -433,11 +444,12 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfl
433444define amdgpu_ps void @v_test_mul_add_v2bf16_vsc (ptr addrspace (1 ) %out , <2 x bfloat> %a , <2 x bfloat> inreg %b ) {
434445; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
435446; GCN: ; %bb.0:
436- ; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
437- ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
438- ; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
447+ ; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
439448; GCN-NEXT: global_store_b32 v[0:1], v2, off
440449; GCN-NEXT: s_endpgm
450+
451+
452+
441453 %mul = fmul contract <2 x bfloat> %a , %b
442454 %add = fadd contract <2 x bfloat> %mul , <bfloat 0 .5 , bfloat 0 .5 >
443455 store <2 x bfloat> %add , ptr addrspace (1 ) %out
@@ -447,11 +459,14 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfl
447459define amdgpu_ps void @v_test_mul_add_v2bf16_vll (ptr addrspace (1 ) %out , <2 x bfloat> %a ) {
448460; GCN-LABEL: v_test_mul_add_v2bf16_vll:
449461; GCN: ; %bb.0:
450- ; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
451- ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1 )
452- ; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000 , v2
462+ ; GCN-NEXT: s_mov_b32 s0, 0x43484000
463+ ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1 )
464+ ; GCN-NEXT: v_pk_fma_bf16 v2, 0x42c83f80 , v2, s0
453465; GCN-NEXT: global_store_b32 v[0:1], v2, off
454466; GCN-NEXT: s_endpgm
467+
468+
469+
455470 %mul = fmul contract <2 x bfloat> %a , <bfloat 1 .0 , bfloat 100 .0 >
456471 %add = fadd contract <2 x bfloat> %mul , <bfloat 2 .0 , bfloat 200 .0 >
457472 store <2 x bfloat> %add , ptr addrspace (1 ) %out
0 commit comments