@@ -31,5 +31,71 @@ define amdgpu_kernel void @rcp_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
3131 ret void
3232}
3333
; Reciprocal of the bfloat constant 4.0, stored to %out.
; The expected output below contains no rcp instruction: both check variants
; simply store the immediate 0x3e80, which is the bf16 bit pattern of 0.25.
define amdgpu_kernel void @rcp_bf16_constant_4(ptr addrspace(1) %out) #1 {
; SDAG-TRUE16-LABEL: rcp_bf16_constant_4:
; SDAG-TRUE16: ; %bb.0:
; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3e80
; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; SDAG-TRUE16-NEXT: flat_store_b16 v1, v0, s[0:1]
; SDAG-TRUE16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: rcp_bf16_constant_4:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3e80
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; SDAG-FAKE16-NEXT: s_endpgm
  %rcp = call bfloat @llvm.amdgcn.rcp.bf16(bfloat 4.0) #0
  store bfloat %rcp, ptr addrspace(1) %out, align 2
  ret void
}
55+
; Reciprocal of the bfloat constant 100.0, stored to %out.
; 1/100 is not exactly representable in bf16; the expected output stores the
; immediate 0x3c24 (about 0.010009765625) and contains no rcp instruction.
define amdgpu_kernel void @rcp_bf16_constant_100(ptr addrspace(1) %out) #1 {
; SDAG-TRUE16-LABEL: rcp_bf16_constant_100:
; SDAG-TRUE16: ; %bb.0:
; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3c24
; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; SDAG-TRUE16-NEXT: flat_store_b16 v1, v0, s[0:1]
; SDAG-TRUE16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: rcp_bf16_constant_100:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c24
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; SDAG-FAKE16-NEXT: s_endpgm
  %rcp = call bfloat @llvm.amdgcn.rcp.bf16(bfloat 100.0) #0
  store bfloat %rcp, ptr addrspace(1) %out, align 2
  ret void
}
77+
; Reciprocal of an undef bfloat operand, stored to %out.
; The expected output stores the immediate 0x7fc0 (a bf16 quiet-NaN bit
; pattern) and contains no rcp instruction.
; NOTE(review): `undef` is discouraged in new LLVM tests in favour of
; `poison`; switching would require regenerating the assertions below, so the
; operand is intentionally left as written -- confirm before changing.
define amdgpu_kernel void @rcp_undef_bf16(ptr addrspace(1) %out) #1 {
; SDAG-TRUE16-LABEL: rcp_undef_bf16:
; SDAG-TRUE16: ; %bb.0:
; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7fc0
; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; SDAG-TRUE16-NEXT: flat_store_b16 v1, v0, s[0:1]
; SDAG-TRUE16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: rcp_undef_bf16:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; SDAG-FAKE16-NEXT: s_endpgm
  %rcp = call bfloat @llvm.amdgcn.rcp.bf16(bfloat undef)
  store bfloat %rcp, ptr addrspace(1) %out, align 2
  ret void
}
99+
; Attribute groups referenced by the tests in this file:
;   #0 - attached to intrinsic call sites
;   #1 - attached to the kernel definitions
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
0 commit comments