@@ -323,6 +323,146 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
323323 ret void
324324}
325325
; Clamp-to-[0.0, 1.0] pattern on a scalar bfloat passed as a plain (non-inreg)
; argument: maxnum(%src, 0.0) followed by minnum(%max, 1.0).
; The checked output is a single v_pk_max_num_bf16 of v0 with itself that
; carries the clamp modifier.
326+ define amdgpu_ps bfloat @test_clamp_bf16 (bfloat %src ) {
327+ ; GCN-LABEL: test_clamp_bf16:
328+ ; GCN: ; %bb.0:
329+ ; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
330+ ; GCN-NEXT: ; return to shader part epilog
331+ %max = call bfloat @llvm.maxnum.bf16 (bfloat %src , bfloat 0 .0 )
332+ %clamp = call bfloat @llvm.minnum.bf16 (bfloat %max , bfloat 1 .0 )
333+ ret bfloat %clamp
334+ }
335+
; Same maxnum(…, 0.0) / minnum(…, 1.0) clamp pattern as the scalar bfloat case,
; but the argument is marked inreg. The checked output reads s0 for both source
; operands of the clamped v_pk_max_num_bf16 and writes the result to v0.
336+ define amdgpu_ps bfloat @test_clamp_bf16_s (bfloat inreg %src ) {
337+ ; GCN-LABEL: test_clamp_bf16_s:
338+ ; GCN: ; %bb.0:
339+ ; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
340+ ; GCN-NEXT: ; return to shader part epilog
341+ %max = call bfloat @llvm.maxnum.bf16 (bfloat %src , bfloat 0 .0 )
342+ %clamp = call bfloat @llvm.minnum.bf16 (bfloat %max , bfloat 1 .0 )
343+ ret bfloat %clamp
344+ }
345+
; Clamp-to-[0.0, 1.0] pattern on a <2 x bfloat> non-inreg argument: vector
; maxnum against a 0.0 splat, then vector minnum against a 1.0 splat.
; The <2 x bfloat> result is bitcast to float so it can be returned as one
; 32-bit value. The checked output is a single clamped v_pk_max_num_bf16.
346+ define amdgpu_ps float @test_clamp_v2bf16 (<2 x bfloat> %src ) {
347+ ; GCN-LABEL: test_clamp_v2bf16:
348+ ; GCN: ; %bb.0:
349+ ; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
350+ ; GCN-NEXT: ; return to shader part epilog
351+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16 (<2 x bfloat> %src , <2 x bfloat> <bfloat 0 .0 , bfloat 0 .0 >)
352+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16 (<2 x bfloat> %max , <2 x bfloat> <bfloat 1 .0 , bfloat 1 .0 >)
353+ %ret = bitcast <2 x bfloat> %clamp to float
354+ ret float %ret
355+ }
356+
; <2 x bfloat> clamp-to-[0.0, 1.0] pattern with an inreg argument.
; The checked output uses s0 for both sources of the clamped v_pk_max_num_bf16
; and places the result in v0; the IR bitcasts the vector to float for return.
357+ define amdgpu_ps float @test_clamp_v2bf16_s (<2 x bfloat> inreg %src ) {
358+ ; GCN-LABEL: test_clamp_v2bf16_s:
359+ ; GCN: ; %bb.0:
360+ ; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
361+ ; GCN-NEXT: ; return to shader part epilog
362+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16 (<2 x bfloat> %src , <2 x bfloat> <bfloat 0 .0 , bfloat 0 .0 >)
363+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16 (<2 x bfloat> %max , <2 x bfloat> <bfloat 1 .0 , bfloat 1 .0 >)
364+ %ret = bitcast <2 x bfloat> %clamp to float
365+ ret float %ret
366+ }
367+
; Scalar bfloat clamp-to-[0.0, 1.0] applied to the result of llvm.exp2.bf16.
; NOTE(review): in the checked output the clamp is NOT merged into the
; v_exp_bf16_e32 producer; a separate clamped v_pk_max_num_bf16 follows it,
; with a v_nop and an s_delay_alu in between. Presumably this records the
; current behaviour for a later folding change; confirm against the commit
; intent.
368+ define amdgpu_ps bfloat @test_clamp_bf16_folding (bfloat %src ) {
369+ ; GCN-LABEL: test_clamp_bf16_folding:
370+ ; GCN: ; %bb.0:
371+ ; GCN-NEXT: v_exp_bf16_e32 v0, v0
372+ ; GCN-NEXT: v_nop
373+ ; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
374+ ; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
375+ ; GCN-NEXT: ; return to shader part epilog
376+ %exp = call bfloat @llvm.exp2.bf16 (bfloat %src )
377+ %max = call bfloat @llvm.maxnum.bf16 (bfloat %exp , bfloat 0 .0 )
378+ %clamp = call bfloat @llvm.minnum.bf16 (bfloat %max , bfloat 1 .0 )
379+ ret bfloat %clamp
380+ }
381+
; <2 x bfloat> clamp-to-[0.0, 1.0] applied to the result of an fmul of the two
; arguments; the clamped vector is bitcast to float for return.
; NOTE(review): the checked output keeps the clamp as a separate
; v_pk_max_num_bf16 after v_pk_mul_bf16 rather than placing the clamp modifier
; on the multiply. Presumably intentional for now; confirm against the commit
; intent.
382+ define amdgpu_ps float @test_clamp_v2bf16_folding (<2 x bfloat> %src0 , <2 x bfloat> %src1 ) {
383+ ; GCN-LABEL: test_clamp_v2bf16_folding:
384+ ; GCN: ; %bb.0:
385+ ; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1
386+ ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
387+ ; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
388+ ; GCN-NEXT: ; return to shader part epilog
389+ %mul = fmul <2 x bfloat> %src0 , %src1
390+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16 (<2 x bfloat> %mul , <2 x bfloat> <bfloat 0 .0 , bfloat 0 .0 >)
391+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16 (<2 x bfloat> %max , <2 x bfloat> <bfloat 1 .0 , bfloat 1 .0 >)
392+ %ret = bitcast <2 x bfloat> %clamp to float
393+ ret float %ret
394+ }
395+
; fmul contract followed by fadd contract on <2 x bfloat>, with all three
; operands passed as non-inreg arguments; the sum is stored to %out.
; The checked output keeps two instructions (v_pk_mul_bf16 then v_pk_add_bf16)
; operating on v2, v3 and v4; %out occupies v[0:1].
396+ define amdgpu_ps void @v_test_mul_add_v2bf16_vvv (ptr addrspace (1 ) %out , <2 x bfloat> %a , <2 x bfloat> %b , <2 x bfloat> %c ) {
397+ ; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
398+ ; GCN: ; %bb.0:
399+ ; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
400+ ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
401+ ; GCN-NEXT: v_pk_add_bf16 v2, v2, v4
402+ ; GCN-NEXT: global_store_b32 v[0:1], v2, off
403+ ; GCN-NEXT: s_endpgm
404+ %mul = fmul contract <2 x bfloat> %a , %b
405+ %add = fadd contract <2 x bfloat> %mul , %c
406+ store <2 x bfloat> %add , ptr addrspace (1 ) %out
407+ ret void
408+ }
409+
; fmul contract + fadd contract on <2 x bfloat> where %a is a non-inreg
; argument and %b, %c are inreg. In the checked output the multiply takes s0
; and the add takes s1 directly as their second source operand.
410+ define amdgpu_ps void @v_test_mul_add_v2bf16_vss (ptr addrspace (1 ) %out , <2 x bfloat> %a , <2 x bfloat> inreg %b , <2 x bfloat> inreg %c ) {
411+ ; GCN-LABEL: v_test_mul_add_v2bf16_vss:
412+ ; GCN: ; %bb.0:
413+ ; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
414+ ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
415+ ; GCN-NEXT: v_pk_add_bf16 v2, v2, s1
416+ ; GCN-NEXT: global_store_b32 v[0:1], v2, off
417+ ; GCN-NEXT: s_endpgm
418+ %mul = fmul contract <2 x bfloat> %a , %b
419+ %add = fadd contract <2 x bfloat> %mul , %c
420+ store <2 x bfloat> %add , ptr addrspace (1 ) %out
421+ ret void
422+ }
423+
; fmul contract + fadd contract on <2 x bfloat> with all three operands inreg.
; In the checked output the multiply reads s0 and s1 and writes v2; the add
; then combines v2 with s2 before the result is stored through v[0:1].
424+ define amdgpu_ps void @v_test_mul_add_v2bf16_sss (ptr addrspace (1 ) %out , <2 x bfloat> inreg %a , <2 x bfloat> inreg %b , <2 x bfloat> inreg %c ) {
425+ ; GCN-LABEL: v_test_mul_add_v2bf16_sss:
426+ ; GCN: ; %bb.0:
427+ ; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
428+ ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
429+ ; GCN-NEXT: v_pk_add_bf16 v2, v2, s2
430+ ; GCN-NEXT: global_store_b32 v[0:1], v2, off
431+ ; GCN-NEXT: s_endpgm
432+ %mul = fmul contract <2 x bfloat> %a , %b
433+ %add = fadd contract <2 x bfloat> %mul , %c
434+ store <2 x bfloat> %add , ptr addrspace (1 ) %out
435+ ret void
436+ }
437+
; fmul contract of a non-inreg %a with an inreg %b, followed by fadd contract
; with a constant <0.5, 0.5> vector. In the checked output the constant appears
; as the operand `0.5` on v_pk_add_bf16 together with op_sel_hi:[1,0].
438+ define amdgpu_ps void @v_test_mul_add_v2bf16_vsc (ptr addrspace (1 ) %out , <2 x bfloat> %a , <2 x bfloat> inreg %b ) {
439+ ; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
440+ ; GCN: ; %bb.0:
441+ ; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
442+ ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
443+ ; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
444+ ; GCN-NEXT: global_store_b32 v[0:1], v2, off
445+ ; GCN-NEXT: s_endpgm
446+ %mul = fmul contract <2 x bfloat> %a , %b
447+ %add = fadd contract <2 x bfloat> %mul , <bfloat 0 .5 , bfloat 0 .5 >
448+ store <2 x bfloat> %add , ptr addrspace (1 ) %out
449+ ret void
450+ }
451+
; fmul contract + fadd contract on <2 x bfloat> where both second operands are
; non-splat constant vectors, so each shows up as a 32-bit literal in the
; checked output:
;   <1.0, 100.0> -> 0x42c83f80 (0x3f80 = bf16 1.0 in the low half,
;                               0x42c8 = bf16 100.0 in the high half)
;   <2.0, 200.0> -> 0x43484000 (0x4000 = bf16 2.0 in the low half,
;                               0x4348 = bf16 200.0 in the high half)
; The literal is the first source operand of each instruction and v2 the second.
452+ define amdgpu_ps void @v_test_mul_add_v2bf16_vll (ptr addrspace (1 ) %out , <2 x bfloat> %a ) {
453+ ; GCN-LABEL: v_test_mul_add_v2bf16_vll:
454+ ; GCN: ; %bb.0:
455+ ; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
456+ ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
457+ ; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000, v2
458+ ; GCN-NEXT: global_store_b32 v[0:1], v2, off
459+ ; GCN-NEXT: s_endpgm
460+ %mul = fmul contract <2 x bfloat> %a , <bfloat 1 .0 , bfloat 100 .0 >
461+ %add = fadd contract <2 x bfloat> %mul , <bfloat 2 .0 , bfloat 200 .0 >
462+ store <2 x bfloat> %add , ptr addrspace (1 ) %out
463+ ret void
464+ }
465+
326466define amdgpu_ps void @v_test_fma_v2bf16_vvv (ptr addrspace (1 ) %out , <2 x bfloat> %a , <2 x bfloat> %b , <2 x bfloat> %c ) {
327467; GCN-LABEL: v_test_fma_v2bf16_vvv:
328468; GCN: ; %bb.0:
@@ -426,6 +566,8 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
426566 ret void
427567}
428568
; Intrinsic declarations used by the tests in this file. The scalar bfloat
; minnum/maxnum declarations are used by the scalar clamp tests; the
; <2 x bfloat> forms are used by the vector clamp tests.
569+ declare bfloat @llvm.minnum.bf16 (bfloat, bfloat)
570+ declare bfloat @llvm.maxnum.bf16 (bfloat, bfloat)
429571declare <2 x bfloat> @llvm.minnum.v2bf16 (<2 x bfloat> %a , <2 x bfloat> %b )
430572declare <2 x bfloat> @llvm.maxnum.v2bf16 (<2 x bfloat> %a , <2 x bfloat> %b )
431573declare <2 x bfloat> @llvm.fma.v2bf16 (<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
0 commit comments