44define amdgpu_ps void @group_image_sample (i32 inreg noundef %globalTable , i32 inreg noundef %userdata6 , i32 inreg noundef %userdata7 , i32 inreg noundef %userdata8 , i32 inreg noundef %PrimMask , <2 x float > noundef %PerspInterpSample , <2 x float > noundef %PerspInterpCenter , <2 x float > noundef %PerspInterpCentroid ) #2 {
55; GFX11-LABEL: group_image_sample:
66; GFX11: ; %bb.0: ; %.entry
7- ; GFX11-NEXT: s_mov_b32 s24 , exec_lo
7+ ; GFX11-NEXT: s_mov_b32 s33 , exec_lo
88; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
99; GFX11-NEXT: s_mov_b32 m0, s4
1010; GFX11-NEXT: s_getpc_b64 s[4:5]
@@ -21,73 +21,79 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
2121; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15
2222; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15
2323; GFX11-NEXT: s_mov_b32 exec_lo, s16
24+ ; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
25+ ; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
2426; GFX11-NEXT: s_waitcnt lgkmcnt(0)
25- ; GFX11-NEXT: s_clause 0x3
27+ ; GFX11-NEXT: s_clause 0xf
2628; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x10
2729; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x20
2830; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x30
2931; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x40
30- ; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
31- ; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
32- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
32+ ; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x50
33+ ; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0x60
34+ ; GFX11-NEXT: s_buffer_load_b64 s[28:29], s[12:15], 0x70
35+ ; GFX11-NEXT: s_buffer_load_b64 s[30:31], s[12:15], 0x80
36+ ; GFX11-NEXT: s_buffer_load_b64 s[34:35], s[12:15], 0x90
37+ ; GFX11-NEXT: s_buffer_load_b64 s[36:37], s[12:15], 0xa0
38+ ; GFX11-NEXT: s_buffer_load_b64 s[38:39], s[12:15], 0xb0
39+ ; GFX11-NEXT: s_buffer_load_b64 s[40:41], s[12:15], 0xc0
40+ ; GFX11-NEXT: s_buffer_load_b64 s[42:43], s[12:15], 0xd0
41+ ; GFX11-NEXT: s_buffer_load_b64 s[44:45], s[12:15], 0xe0
42+ ; GFX11-NEXT: s_buffer_load_b64 s[46:47], s[12:15], 0xf0
43+ ; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
44+ ; GFX11-NEXT: v_interp_p2_f32 v36, v2, v1, v4 wait_exp:7
3345; GFX11-NEXT: v_interp_p2_f32 v0, v3, v1, v0 wait_exp:7
34- ; GFX11-NEXT: v_interp_p2_f32 v1, v2, v1, v4 wait_exp:7
3546; GFX11-NEXT: s_waitcnt lgkmcnt(0)
36- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
37- ; GFX11-NEXT: v_dual_add_f32 v4, s16, v0 :: v_dual_add_f32 v5, s17, v1
38- ; GFX11-NEXT: v_dual_add_f32 v12, s20, v0 :: v_dual_add_f32 v13, s21, v1
39- ; GFX11-NEXT: v_dual_add_f32 v8, s18, v0 :: v_dual_add_f32 v9, s19, v1
40- ; GFX11-NEXT: v_dual_add_f32 v16, s22, v0 :: v_dual_add_f32 v17, s23, v1
41- ; GFX11-NEXT: s_clause 0x3
47+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
48+ ; GFX11-NEXT: v_add_f32_e32 v5, s17, v36
49+ ; GFX11-NEXT: v_add_f32_e32 v4, s16, v0
50+ ; GFX11-NEXT: v_add_f32_e32 v8, s18, v0
51+ ; GFX11-NEXT: v_add_f32_e32 v9, s19, v36
52+ ; GFX11-NEXT: v_add_f32_e32 v12, s20, v0
53+ ; GFX11-NEXT: v_add_f32_e32 v13, s21, v36
54+ ; GFX11-NEXT: v_add_f32_e32 v16, s22, v0
55+ ; GFX11-NEXT: v_add_f32_e32 v17, s23, v36
56+ ; GFX11-NEXT: v_add_f32_e32 v20, s24, v0
57+ ; GFX11-NEXT: v_add_f32_e32 v21, s25, v36
58+ ; GFX11-NEXT: v_add_f32_e32 v24, s26, v0
59+ ; GFX11-NEXT: v_add_f32_e32 v25, s27, v36
60+ ; GFX11-NEXT: v_add_f32_e32 v28, s28, v0
61+ ; GFX11-NEXT: v_add_f32_e32 v29, s29, v36
62+ ; GFX11-NEXT: v_add_f32_e32 v32, s30, v0
63+ ; GFX11-NEXT: v_add_f32_e32 v33, s31, v36
64+ ; GFX11-NEXT: s_clause 0x7
4265; GFX11-NEXT: image_sample v[4:7], v[4:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
4366; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
4467; GFX11-NEXT: image_sample v[12:15], v[12:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
4568; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
46- ; GFX11-NEXT: s_clause 0x3
47- ; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x50
48- ; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x60
49- ; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x70
50- ; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x80
51- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
52- ; GFX11-NEXT: v_dual_add_f32 v20, s16, v0 :: v_dual_add_f32 v21, s17, v1
53- ; GFX11-NEXT: v_dual_add_f32 v28, s20, v0 :: v_dual_add_f32 v29, s21, v1
54- ; GFX11-NEXT: v_dual_add_f32 v24, s18, v0 :: v_dual_add_f32 v25, s19, v1
55- ; GFX11-NEXT: v_dual_add_f32 v32, s22, v0 :: v_dual_add_f32 v33, s23, v1
56- ; GFX11-NEXT: s_clause 0x3
5769; GFX11-NEXT: image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
5870; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
5971; GFX11-NEXT: image_sample v[28:31], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
6072; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
61- ; GFX11-NEXT: s_clause 0x3
62- ; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x90
63- ; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xa0
64- ; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xb0
65- ; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xc0
66- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
67- ; GFX11-NEXT: v_dual_add_f32 v36, s16, v0 :: v_dual_add_f32 v37, s17, v1
68- ; GFX11-NEXT: v_dual_add_f32 v44, s20, v0 :: v_dual_add_f32 v45, s21, v1
69- ; GFX11-NEXT: v_dual_add_f32 v40, s18, v0 :: v_dual_add_f32 v41, s19, v1
70- ; GFX11-NEXT: v_dual_add_f32 v48, s22, v0 :: v_dual_add_f32 v49, s23, v1
71- ; GFX11-NEXT: s_clause 0x3
72- ; GFX11-NEXT: image_sample v[36:39], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
73+ ; GFX11-NEXT: v_add_f32_e32 v37, s34, v0
74+ ; GFX11-NEXT: v_add_f32_e32 v38, s35, v36
75+ ; GFX11-NEXT: v_add_f32_e32 v40, s36, v0
76+ ; GFX11-NEXT: v_add_f32_e32 v41, s37, v36
77+ ; GFX11-NEXT: v_add_f32_e32 v44, s38, v0
78+ ; GFX11-NEXT: v_add_f32_e32 v45, s39, v36
79+ ; GFX11-NEXT: v_add_f32_e32 v48, s40, v0
80+ ; GFX11-NEXT: v_add_f32_e32 v49, s41, v36
81+ ; GFX11-NEXT: v_add_f32_e32 v52, s42, v0
82+ ; GFX11-NEXT: v_add_f32_e32 v53, s43, v36
83+ ; GFX11-NEXT: v_add_f32_e32 v56, s44, v0
84+ ; GFX11-NEXT: v_add_f32_e32 v57, s45, v36
85+ ; GFX11-NEXT: v_add_f32_e32 v60, s46, v0
86+ ; GFX11-NEXT: v_add_f32_e32 v61, s47, v36
87+ ; GFX11-NEXT: v_add_f32_e32 v0, s12, v0
88+ ; GFX11-NEXT: v_add_f32_e32 v1, s13, v36
89+ ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s33
90+ ; GFX11-NEXT: s_clause 0x7
91+ ; GFX11-NEXT: image_sample v[36:39], v[37:38], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
7392; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
7493; GFX11-NEXT: image_sample v[44:47], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
7594; GFX11-NEXT: image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
76- ; GFX11-NEXT: s_clause 0x3
77- ; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0xd0
78- ; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xe0
79- ; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xf0
80- ; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
81- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
82- ; GFX11-NEXT: v_dual_add_f32 v52, s16, v0 :: v_dual_add_f32 v53, s17, v1
83- ; GFX11-NEXT: v_dual_add_f32 v56, s18, v0 :: v_dual_add_f32 v57, s19, v1
84- ; GFX11-NEXT: s_clause 0x1
8595; GFX11-NEXT: image_sample v[52:55], v[52:53], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
8696; GFX11-NEXT: image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
87- ; GFX11-NEXT: v_dual_add_f32 v60, s20, v0 :: v_dual_add_f32 v61, s21, v1
88- ; GFX11-NEXT: v_dual_add_f32 v0, s12, v0 :: v_dual_add_f32 v1, s13, v1
89- ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s24
90- ; GFX11-NEXT: s_clause 0x1
9197; GFX11-NEXT: image_sample v[60:63], v[60:61], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
9298; GFX11-NEXT: image_sample v[64:67], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
9399; GFX11-NEXT: s_waitcnt vmcnt(14)
@@ -446,7 +452,7 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3
446452declare float @llvm.amdgcn.interp.inreg.p2 (float , float , float ) #3
447453declare <2 x i32 > @llvm.amdgcn.s.buffer.load.v2i32 (<4 x i32 >, i32 , i32 immarg) #8
448454
449- attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy" ="max-memory-clause" }
455+ attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy" ="max-memory-clause" "amdgpu-max-memory-cluster-dwords" = "32" }
450456attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
451457attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
452458attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) }
0 commit comments