@@ -370,6 +370,9 @@ define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
370
370
; GCN: ; %bb.0:
371
371
; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp
372
372
; GCN-NEXT: ; return to shader part epilog
373
+
374
+
375
+
373
376
%exp = call bfloat @llvm.exp2.bf16 (bfloat %src )
374
377
%max = call bfloat @llvm.maxnum.bf16 (bfloat %exp , bfloat 0 .0 )
375
378
%clamp = call bfloat @llvm.minnum.bf16 (bfloat %max , bfloat 1 .0 )
@@ -381,6 +384,9 @@ define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloa
381
384
; GCN: ; %bb.0:
382
385
; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp
383
386
; GCN-NEXT: ; return to shader part epilog
387
+
388
+
389
+
384
390
%mul = fmul <2 x bfloat> %src0 , %src1
385
391
%max = call <2 x bfloat> @llvm.maxnum.v2bf16 (<2 x bfloat> %mul , <2 x bfloat> <bfloat 0 .0 , bfloat 0 .0 >)
386
392
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16 (<2 x bfloat> %max , <2 x bfloat> <bfloat 1 .0 , bfloat 1 .0 >)
@@ -391,11 +397,12 @@ define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloa
391
397
define amdgpu_ps void @v_test_mul_add_v2bf16_vvv (ptr addrspace (1 ) %out , <2 x bfloat> %a , <2 x bfloat> %b , <2 x bfloat> %c ) {
392
398
; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
393
399
; GCN: ; %bb.0:
394
- ; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
395
- ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
396
- ; GCN-NEXT: v_pk_add_bf16 v2, v2, v4
400
+ ; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4
397
401
; GCN-NEXT: global_store_b32 v[0:1], v2, off
398
402
; GCN-NEXT: s_endpgm
403
+
404
+
405
+
399
406
%mul = fmul contract <2 x bfloat> %a , %b
400
407
%add = fadd contract <2 x bfloat> %mul , %c
401
408
store <2 x bfloat> %add , ptr addrspace (1 ) %out
@@ -405,11 +412,12 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfl
405
412
define amdgpu_ps void @v_test_mul_add_v2bf16_vss (ptr addrspace (1 ) %out , <2 x bfloat> %a , <2 x bfloat> inreg %b , <2 x bfloat> inreg %c ) {
406
413
; GCN-LABEL: v_test_mul_add_v2bf16_vss:
407
414
; GCN: ; %bb.0:
408
- ; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
409
- ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
410
- ; GCN-NEXT: v_pk_add_bf16 v2, v2, s1
415
+ ; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1
411
416
; GCN-NEXT: global_store_b32 v[0:1], v2, off
412
417
; GCN-NEXT: s_endpgm
418
+
419
+
420
+
413
421
%mul = fmul contract <2 x bfloat> %a , %b
414
422
%add = fadd contract <2 x bfloat> %mul , %c
415
423
store <2 x bfloat> %add , ptr addrspace (1 ) %out
@@ -419,11 +427,14 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfl
419
427
define amdgpu_ps void @v_test_mul_add_v2bf16_sss (ptr addrspace (1 ) %out , <2 x bfloat> inreg %a , <2 x bfloat> inreg %b , <2 x bfloat> inreg %c ) {
420
428
; GCN-LABEL: v_test_mul_add_v2bf16_sss:
421
429
; GCN: ; %bb.0:
422
- ; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
430
+ ; GCN-NEXT: v_mov_b32_e32 v2, s2
423
431
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
424
- ; GCN-NEXT: v_pk_add_bf16 v2, v2, s2
432
+ ; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2
425
433
; GCN-NEXT: global_store_b32 v[0:1], v2, off
426
434
; GCN-NEXT: s_endpgm
435
+
436
+
437
+
427
438
%mul = fmul contract <2 x bfloat> %a , %b
428
439
%add = fadd contract <2 x bfloat> %mul , %c
429
440
store <2 x bfloat> %add , ptr addrspace (1 ) %out
@@ -433,11 +444,12 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfl
433
444
define amdgpu_ps void @v_test_mul_add_v2bf16_vsc (ptr addrspace (1 ) %out , <2 x bfloat> %a , <2 x bfloat> inreg %b ) {
434
445
; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
435
446
; GCN: ; %bb.0:
436
- ; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
437
- ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
438
- ; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
447
+ ; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
439
448
; GCN-NEXT: global_store_b32 v[0:1], v2, off
440
449
; GCN-NEXT: s_endpgm
450
+
451
+
452
+
441
453
%mul = fmul contract <2 x bfloat> %a , %b
442
454
%add = fadd contract <2 x bfloat> %mul , <bfloat 0 .5 , bfloat 0 .5 >
443
455
store <2 x bfloat> %add , ptr addrspace (1 ) %out
@@ -447,11 +459,14 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfl
447
459
define amdgpu_ps void @v_test_mul_add_v2bf16_vll (ptr addrspace (1 ) %out , <2 x bfloat> %a ) {
448
460
; GCN-LABEL: v_test_mul_add_v2bf16_vll:
449
461
; GCN: ; %bb.0:
450
- ; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
451
- ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1 )
452
- ; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000 , v2
462
+ ; GCN-NEXT: s_mov_b32 s0, 0x43484000
463
+ ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1 )
464
+ ; GCN-NEXT: v_pk_fma_bf16 v2, 0x42c83f80 , v2, s0
453
465
; GCN-NEXT: global_store_b32 v[0:1], v2, off
454
466
; GCN-NEXT: s_endpgm
467
+
468
+
469
+
455
470
%mul = fmul contract <2 x bfloat> %a , <bfloat 1 .0 , bfloat 100 .0 >
456
471
%add = fadd contract <2 x bfloat> %mul , <bfloat 2 .0 , bfloat 200 .0 >
457
472
store <2 x bfloat> %add , ptr addrspace (1 ) %out
0 commit comments