33; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s
44; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
55; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
6- ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
7- ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s
6+ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
7+ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
8+ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-TRUE16 %s
9+ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-FAKE16 %s
810
911define amdgpu_kernel void @cos_f16 (ptr addrspace (1 ) %r , ptr addrspace (1 ) %a ) {
1012; GFX6-LABEL: cos_f16:
@@ -69,31 +71,57 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
6971; GFX10-NEXT: global_store_short v0, v1, s[0:1]
7072; GFX10-NEXT: s_endpgm
7173;
72- ; GFX11-LABEL: cos_f16:
73- ; GFX11: ; %bb.0:
74- ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
75- ; GFX11-NEXT: v_mov_b32_e32 v0, 0
76- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
77- ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
78- ; GFX11-NEXT: s_waitcnt vmcnt(0)
79- ; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
80- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
81- ; GFX11-NEXT: v_cos_f16_e32 v1, v1
82- ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
83- ; GFX11-NEXT: s_endpgm
74+ ; GFX11-TRUE16-LABEL: cos_f16:
75+ ; GFX11-TRUE16: ; %bb.0:
76+ ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
77+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
78+ ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
79+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
80+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
81+ ; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
82+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
83+ ; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
84+ ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
85+ ; GFX11-TRUE16-NEXT: s_endpgm
8486;
85- ; GFX12-LABEL: cos_f16:
86- ; GFX12: ; %bb.0:
87- ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
88- ; GFX12-NEXT: v_mov_b32_e32 v0, 0
89- ; GFX12-NEXT: s_wait_kmcnt 0x0
90- ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
91- ; GFX12-NEXT: s_wait_loadcnt 0x0
92- ; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
93- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
94- ; GFX12-NEXT: v_cos_f16_e32 v1, v1
95- ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
96- ; GFX12-NEXT: s_endpgm
87+ ; GFX11-FAKE16-LABEL: cos_f16:
88+ ; GFX11-FAKE16: ; %bb.0:
89+ ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
90+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
91+ ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
92+ ; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
93+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
94+ ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
95+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
96+ ; GFX11-FAKE16-NEXT: v_cos_f16_e32 v1, v1
97+ ; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
98+ ; GFX11-FAKE16-NEXT: s_endpgm
99+ ;
100+ ; GFX12-TRUE16-LABEL: cos_f16:
101+ ; GFX12-TRUE16: ; %bb.0:
102+ ; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
103+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
104+ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
105+ ; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
106+ ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
107+ ; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
108+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
109+ ; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
110+ ; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
111+ ; GFX12-TRUE16-NEXT: s_endpgm
112+ ;
113+ ; GFX12-FAKE16-LABEL: cos_f16:
114+ ; GFX12-FAKE16: ; %bb.0:
115+ ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
116+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0
117+ ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
118+ ; GFX12-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
119+ ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
120+ ; GFX12-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
121+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
122+ ; GFX12-FAKE16-NEXT: v_cos_f16_e32 v1, v1
123+ ; GFX12-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
124+ ; GFX12-FAKE16-NEXT: s_endpgm
97125 %a.val = load half , ptr addrspace (1 ) %a
98126 %r.val = call half @llvm.cos.f16 (half %a.val )
99127 store half %r.val , ptr addrspace (1 ) %r
@@ -184,42 +212,79 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
184212; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
185213; GFX10-NEXT: s_endpgm
186214;
187- ; GFX11-LABEL: cos_v2f16:
188- ; GFX11: ; %bb.0:
189- ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
190- ; GFX11-NEXT: v_mov_b32_e32 v0, 0
191- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
192- ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
193- ; GFX11-NEXT: s_waitcnt vmcnt(0)
194- ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
195- ; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
196- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
197- ; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
198- ; GFX11-NEXT: v_cos_f16_e32 v1, v1
199- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
200- ; GFX11-NEXT: v_cos_f16_e32 v2, v2
201- ; GFX11-NEXT: s_waitcnt_depctr 0xfff
202- ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
203- ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
204- ; GFX11-NEXT: s_endpgm
215+ ; GFX11-TRUE16-LABEL: cos_v2f16:
216+ ; GFX11-TRUE16: ; %bb.0:
217+ ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
218+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
219+ ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
220+ ; GFX11-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
221+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
222+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
223+ ; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
224+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
225+ ; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
226+ ; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
227+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
228+ ; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
229+ ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
230+ ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
231+ ; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
232+ ; GFX11-TRUE16-NEXT: s_endpgm
233+ ;
234+ ; GFX11-FAKE16-LABEL: cos_v2f16:
235+ ; GFX11-FAKE16: ; %bb.0:
236+ ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
237+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
238+ ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
239+ ; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
240+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
241+ ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
242+ ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
243+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
244+ ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
245+ ; GFX11-FAKE16-NEXT: v_cos_f16_e32 v1, v1
246+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
247+ ; GFX11-FAKE16-NEXT: v_cos_f16_e32 v2, v2
248+ ; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
249+ ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
250+ ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
251+ ; GFX11-FAKE16-NEXT: s_endpgm
252+ ;
253+ ; GFX12-TRUE16-LABEL: cos_v2f16:
254+ ; GFX12-TRUE16: ; %bb.0:
255+ ; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
256+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
257+ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
258+ ; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
259+ ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
260+ ; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
261+ ; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
262+ ; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
263+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
264+ ; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
265+ ; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
266+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
267+ ; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
268+ ; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
269+ ; GFX12-TRUE16-NEXT: s_endpgm
205270;
206- ; GFX12-LABEL: cos_v2f16:
207- ; GFX12: ; %bb.0:
208- ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
209- ; GFX12-NEXT: v_mov_b32_e32 v0, 0
210- ; GFX12-NEXT: s_wait_kmcnt 0x0
211- ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
212- ; GFX12-NEXT: s_wait_loadcnt 0x0
213- ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1
214- ; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
215- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
216- ; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
217- ; GFX12-NEXT: v_cos_f16_e32 v1, v1
218- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
219- ; GFX12-NEXT: v_cos_f16_e32 v2, v2
220- ; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2
221- ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
222- ; GFX12-NEXT: s_endpgm
271+ ; GFX12-FAKE16-LABEL: cos_v2f16:
272+ ; GFX12-FAKE16: ; %bb.0:
273+ ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
274+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0
275+ ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
276+ ; GFX12-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
277+ ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
278+ ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
279+ ; GFX12-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
280+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
281+ ; GFX12-FAKE16-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
282+ ; GFX12-FAKE16-NEXT: v_cos_f16_e32 v1, v1
283+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
284+ ; GFX12-FAKE16-NEXT: v_cos_f16_e32 v2, v2
285+ ; GFX12-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
286+ ; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
287+ ; GFX12-FAKE16-NEXT: s_endpgm
223288 %a.val = load <2 x half >, ptr addrspace (1 ) %a
224289 %r.val = call <2 x half > @llvm.cos.v2f16 (<2 x half > %a.val )
225290 store <2 x half > %r.val , ptr addrspace (1 ) %r
@@ -228,3 +293,6 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
228293
229294declare half @llvm.cos.f16 (half %a )
230295declare <2 x half > @llvm.cos.v2f16 (<2 x half > %a )
296+ ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
297+ ; GFX11: {{.*}}
298+ ; GFX12: {{.*}}
0 commit comments