@@ -31,5 +31,68 @@ define amdgpu_kernel void @rcp_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
31
31
ret void
32
32
}
33
33
34
; Constant-operand case: llvm.amdgcn.rcp.bf16 is called on the literal 4.0 and
; the result is stored through %out. Under both check prefixes the expected
; output contains no reciprocal instruction; it only moves the immediate 0x3e80
; (the bf16 bit pattern of 0.25) into a VGPR and stores it, i.e. the call is
; expected to be folded away at compile time.
; The two prefixes differ only in the store mnemonic: flat_store_b16 for
; SDAG-TRUE16, global_store_b16 for SDAG-FAKE16.
; NOTE(review): the CHECK lines look machine-generated; regenerate them rather
; than editing by hand - confirm against the header of the full file.
+ define amdgpu_kernel void @rcp_bf16_constant_4 (ptr addrspace (1 ) %out ) #1 {
35
+ ; SDAG-TRUE16-LABEL: rcp_bf16_constant_4:
36
+ ; SDAG-TRUE16: ; %bb.0:
37
+ ; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
38
+ ; SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3e80
39
+ ; SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
40
+ ; SDAG-TRUE16-NEXT: flat_store_b16 v0, v1, s[0:1]
41
+ ; SDAG-TRUE16-NEXT: s_endpgm
42
+ ;
43
+ ; SDAG-FAKE16-LABEL: rcp_bf16_constant_4:
44
+ ; SDAG-FAKE16: ; %bb.0:
45
+ ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
46
+ ; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3e80
47
+ ; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
48
+ ; SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
49
+ ; SDAG-FAKE16-NEXT: s_endpgm
50
+ %rcp = call bfloat @llvm.amdgcn.rcp.bf16 (bfloat 4 .0 ) #0
51
+ store bfloat %rcp , ptr addrspace (1 ) %out , align 2
52
+ ret void
53
+ }
54
+
55
; Constant-operand case with an inexact result: llvm.amdgcn.rcp.bf16 is called
; on the literal 100.0 and the result is stored through %out. Under both check
; prefixes the expected output contains no reciprocal instruction; it only
; moves the immediate 0x3c24 into a VGPR and stores it. 0x3c24 decodes as bf16
; to 0.010009765625, the representable value closest to 1/100, so this case
; also pins the rounding of the folded constant.
; The two prefixes differ only in the store mnemonic: flat_store_b16 for
; SDAG-TRUE16, global_store_b16 for SDAG-FAKE16.
+ define amdgpu_kernel void @rcp_bf16_constant_100 (ptr addrspace (1 ) %out ) #1 {
56
+ ; SDAG-TRUE16-LABEL: rcp_bf16_constant_100:
57
+ ; SDAG-TRUE16: ; %bb.0:
58
+ ; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
59
+ ; SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c24
60
+ ; SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
61
+ ; SDAG-TRUE16-NEXT: flat_store_b16 v0, v1, s[0:1]
62
+ ; SDAG-TRUE16-NEXT: s_endpgm
63
+ ;
64
+ ; SDAG-FAKE16-LABEL: rcp_bf16_constant_100:
65
+ ; SDAG-FAKE16: ; %bb.0:
66
+ ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
67
+ ; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c24
68
+ ; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
69
+ ; SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
70
+ ; SDAG-FAKE16-NEXT: s_endpgm
71
+ %rcp = call bfloat @llvm.amdgcn.rcp.bf16 (bfloat 100 .0 ) #0
72
+ store bfloat %rcp , ptr addrspace (1 ) %out , align 2
73
+ ret void
74
+ }
75
+
76
; Undefined-operand case: llvm.amdgcn.rcp.bf16 is called on `undef` and the
; result is stored through %out. Under both check prefixes the expected output
; contains no reciprocal instruction; it only moves the immediate 0x7fc0 (a
; bf16 NaN bit pattern: exponent all ones, top mantissa bit set) into a VGPR
; and stores it.
; The two prefixes differ only in the store mnemonic: flat_store_b16 for
; SDAG-TRUE16, global_store_b16 for SDAG-FAKE16.
; NOTE(review): unlike the constant-operand tests in this file, the call below
; carries no `#0` attribute group - confirm whether that is intentional.
; NOTE(review): new LLVM tests are generally asked to use `poison` instead of
; `undef`; switching would require regenerating the CHECK lines - confirm.
+ define amdgpu_kernel void @rcp_undef_bf16 (ptr addrspace (1 ) %out ) #1 {
77
+ ; SDAG-TRUE16-LABEL: rcp_undef_bf16:
78
+ ; SDAG-TRUE16: ; %bb.0:
79
+ ; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
80
+ ; SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0
81
+ ; SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
82
+ ; SDAG-TRUE16-NEXT: flat_store_b16 v0, v1, s[0:1]
83
+ ; SDAG-TRUE16-NEXT: s_endpgm
84
+ ;
85
+ ; SDAG-FAKE16-LABEL: rcp_undef_bf16:
86
+ ; SDAG-FAKE16: ; %bb.0:
87
+ ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
88
+ ; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0
89
+ ; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
90
+ ; SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
91
+ ; SDAG-FAKE16-NEXT: s_endpgm
92
+ %rcp = call bfloat @llvm.amdgcn.rcp.bf16 (bfloat undef )
93
+ store bfloat %rcp , ptr addrspace (1 ) %out , align 2
94
+ ret void
95
+ }
96
+
34
97
attributes #0 = { nounwind readnone }
35
98
attributes #1 = { nounwind }
0 commit comments