11; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
33; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
4+ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
5+ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
46
57define amdgpu_ps void @v_interp_f32 (float inreg %i , float inreg %j , i32 inreg %m0 ) #0 {
68; GFX11-LABEL: v_interp_f32:
@@ -21,6 +23,25 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
2123; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
2224; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done
2325; GFX11-NEXT: s_endpgm
26+ ;
27+ ; GFX12-LABEL: v_interp_f32:
28+ ; GFX12: ; %bb.0: ; %main_body
29+ ; GFX12-NEXT: s_mov_b32 s3, exec_lo
30+ ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo
31+ ; GFX12-NEXT: s_mov_b32 m0, s2
32+ ; GFX12-NEXT: ds_param_load v0, attr0.y wait_va_vdst:15 wait_vm_vsrc:1
33+ ; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1
34+ ; GFX12-NEXT: s_mov_b32 exec_lo, s3
35+ ; GFX12-NEXT: v_mov_b32_e32 v2, s0
36+ ; GFX12-NEXT: v_mov_b32_e32 v4, s1
37+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
38+ ; GFX12-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
39+ ; GFX12-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
40+ ; GFX12-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
41+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
42+ ; GFX12-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
43+ ; GFX12-NEXT: export mrt0 v3, v2, v5, v4 done
44+ ; GFX12-NEXT: s_endpgm
2445main_body:
2546 %p0 = call float @llvm.amdgcn.lds.param.load (i32 1 , i32 0 , i32 %m0 )
2647 %p1 = call float @llvm.amdgcn.lds.param.load (i32 0 , i32 1 , i32 %m0 )
@@ -57,6 +78,31 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
5778; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
5879; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done
5980; GFX11-NEXT: s_endpgm
81+ ;
82+ ; GFX12-LABEL: v_interp_f32_many:
83+ ; GFX12: ; %bb.0: ; %main_body
84+ ; GFX12-NEXT: s_mov_b32 s3, exec_lo
85+ ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo
86+ ; GFX12-NEXT: s_mov_b32 m0, s2
87+ ; GFX12-NEXT: ds_param_load v0, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
88+ ; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1
89+ ; GFX12-NEXT: ds_param_load v2, attr2.x wait_va_vdst:15 wait_vm_vsrc:1
90+ ; GFX12-NEXT: ds_param_load v3, attr3.x wait_va_vdst:15 wait_vm_vsrc:1
91+ ; GFX12-NEXT: s_mov_b32 exec_lo, s3
92+ ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
93+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
94+ ; GFX12-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
95+ ; GFX12-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
96+ ; GFX12-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
97+ ; GFX12-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
98+ ; GFX12-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
99+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
100+ ; GFX12-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
101+ ; GFX12-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
102+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
103+ ; GFX12-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
104+ ; GFX12-NEXT: export mrt0 v6, v7, v8, v4 done
105+ ; GFX12-NEXT: s_endpgm
60106main_body:
61107 %p0 = call float @llvm.amdgcn.lds.param.load (i32 0 , i32 0 , i32 %m0 )
62108 %p1 = call float @llvm.amdgcn.lds.param.load (i32 0 , i32 1 , i32 %m0 )
@@ -99,6 +145,31 @@ define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0
99145; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
100146; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done
101147; GFX11-NEXT: s_endpgm
148+ ;
149+ ; GFX12-LABEL: v_interp_f32_many_vm:
150+ ; GFX12: ; %bb.0: ; %main_body
151+ ; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
152+ ; GFX12-NEXT: s_mov_b32 m0, s0
153+ ; GFX12-NEXT: s_mov_b32 s0, exec_lo
154+ ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo
155+ ; GFX12-NEXT: ds_param_load v2, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
156+ ; GFX12-NEXT: ds_param_load v3, attr1.x wait_va_vdst:15 wait_vm_vsrc:1
157+ ; GFX12-NEXT: ds_param_load v4, attr2.x wait_va_vdst:15 wait_vm_vsrc:1
158+ ; GFX12-NEXT: ds_param_load v5, attr3.x wait_va_vdst:15 wait_vm_vsrc:1
159+ ; GFX12-NEXT: s_mov_b32 exec_lo, s0
160+ ; GFX12-NEXT: s_wait_loadcnt 0x0
161+ ; GFX12-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
162+ ; GFX12-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
163+ ; GFX12-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
164+ ; GFX12-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
165+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
166+ ; GFX12-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
167+ ; GFX12-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
168+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
169+ ; GFX12-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
170+ ; GFX12-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
171+ ; GFX12-NEXT: export mrt0 v6, v7, v8, v0 done
172+ ; GFX12-NEXT: s_endpgm
102173main_body:
103174 %i.ptr = getelementptr float , ptr addrspace (1 ) %ptr , i32 1
104175 %i = load float , ptr addrspace (1 ) %i.ptr , align 4
@@ -156,6 +227,42 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
156227; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
157228; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
158229; GFX11-FAKE16-NEXT: ; return to shader part epilog
230+ ;
231+ ; GFX12-TRUE16-LABEL: v_interp_f16:
232+ ; GFX12-TRUE16: ; %bb.0: ; %main_body
233+ ; GFX12-TRUE16-NEXT: s_mov_b32 s3, exec_lo
234+ ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
235+ ; GFX12-TRUE16-NEXT: s_mov_b32 m0, s2
236+ ; GFX12-TRUE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
237+ ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s3
238+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, s0
239+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s1
240+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
241+ ; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
242+ ; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
243+ ; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
244+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
245+ ; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
246+ ; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
247+ ; GFX12-TRUE16-NEXT: ; return to shader part epilog
248+ ;
249+ ; GFX12-FAKE16-LABEL: v_interp_f16:
250+ ; GFX12-FAKE16: ; %bb.0: ; %main_body
251+ ; GFX12-FAKE16-NEXT: s_mov_b32 s3, exec_lo
252+ ; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
253+ ; GFX12-FAKE16-NEXT: s_mov_b32 m0, s2
254+ ; GFX12-FAKE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
255+ ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s3
256+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0
257+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1
258+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
259+ ; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
260+ ; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
261+ ; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
262+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
263+ ; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
264+ ; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
265+ ; GFX12-FAKE16-NEXT: ; return to shader part epilog
159266main_body:
160267 %p0 = call float @llvm.amdgcn.lds.param.load (i32 0 , i32 0 , i32 %m0 )
161268 %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16 (float %p0 , float %i , float %p0 , i1 0 )
@@ -202,6 +309,42 @@ define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inre
202309; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
203310; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
204311; GFX11-FAKE16-NEXT: ; return to shader part epilog
312+ ;
313+ ; GFX12-TRUE16-LABEL: v_interp_rtz_f16:
314+ ; GFX12-TRUE16: ; %bb.0: ; %main_body
315+ ; GFX12-TRUE16-NEXT: s_mov_b32 s3, exec_lo
316+ ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
317+ ; GFX12-TRUE16-NEXT: s_mov_b32 m0, s2
318+ ; GFX12-TRUE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
319+ ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s3
320+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, s0
321+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s1
322+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
323+ ; GFX12-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
324+ ; GFX12-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
325+ ; GFX12-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
326+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
327+ ; GFX12-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
328+ ; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
329+ ; GFX12-TRUE16-NEXT: ; return to shader part epilog
330+ ;
331+ ; GFX12-FAKE16-LABEL: v_interp_rtz_f16:
332+ ; GFX12-FAKE16: ; %bb.0: ; %main_body
333+ ; GFX12-FAKE16-NEXT: s_mov_b32 s3, exec_lo
334+ ; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
335+ ; GFX12-FAKE16-NEXT: s_mov_b32 m0, s2
336+ ; GFX12-FAKE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
337+ ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s3
338+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0
339+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1
340+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
341+ ; GFX12-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
342+ ; GFX12-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
343+ ; GFX12-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
344+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
345+ ; GFX12-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
346+ ; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
347+ ; GFX12-FAKE16-NEXT: ; return to shader part epilog
205348main_body:
206349 %p0 = call float @llvm.amdgcn.lds.param.load (i32 0 , i32 0 , i32 %m0 )
207350 %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16 (float %p0 , float %i , float %p0 , i1 0 )
@@ -237,6 +380,31 @@ define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #
237380; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
238381; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0
239382; GFX11-FAKE16-NEXT: ; return to shader part epilog
383+ ;
384+ ; GFX12-TRUE16-LABEL: v_interp_f16_imm_params:
385+ ; GFX12-TRUE16: ; %bb.0: ; %main_body
386+ ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
387+ ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
388+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s1
389+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
390+ ; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
391+ ; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
392+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
393+ ; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
394+ ; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
395+ ; GFX12-TRUE16-NEXT: ; return to shader part epilog
396+ ;
397+ ; GFX12-FAKE16-LABEL: v_interp_f16_imm_params:
398+ ; GFX12-FAKE16: ; %bb.0: ; %main_body
399+ ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
400+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1
401+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
402+ ; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
403+ ; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
404+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
405+ ; GFX12-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
406+ ; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0
407+ ; GFX12-FAKE16-NEXT: ; return to shader part epilog
240408main_body:
241409 %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16 (float 0 .0 , float %i , float 0 .0 , i1 0 )
242410 %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16 (float 0 .0 , float %j , float 0 .0 , i1 0 )
0 commit comments