@@ -14,7 +14,7 @@ define i32 @static_alloca() {
1414; ISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
1515; ISEL-NEXT: s_mov_b64 exec, s[18:19]
1616; ISEL-NEXT: s_addk_i32 s32, 0x400
17- ; ISEL-NEXT: v_writelane_b32 v40, s16, 4
17+ ; ISEL-NEXT: v_writelane_b32 v40, s16, 3
1818; ISEL-NEXT: s_getpc_b64 s[16:17]
1919; ISEL-NEXT: s_add_u32 s16, s16, bar@rel32@lo+4
2020; ISEL-NEXT: s_addc_u32 s17, s17, bar@rel32@hi+12
@@ -27,25 +27,22 @@ define i32 @static_alloca() {
2727; ISEL-NEXT: v_writelane_b32 v40, s34, 2
2828; ISEL-NEXT: s_cselect_b32 s34, s18, 0
2929; ISEL-NEXT: s_mov_b64 s[18:19], src_private_base
30- ; ISEL-NEXT: v_writelane_b32 v40, s35, 3
31- ; ISEL-NEXT: s_cselect_b32 s35, s19, 0
30+ ; ISEL-NEXT: s_cselect_b32 s18, s19, 0
3231; ISEL-NEXT: v_mov_b32_e32 v0, s34
33- ; ISEL-NEXT: v_mov_b32_e32 v1, s35
32+ ; ISEL-NEXT: v_mov_b32_e32 v1, s18
3433; ISEL-NEXT: s_swappc_b64 s[30:31], s[16:17]
3534; ISEL-NEXT: v_mov_b32_e32 v0, s34
36- ; ISEL-NEXT: v_mov_b32_e32 v1, s35
37- ; ISEL-NEXT: flat_load_dword v0, v[0:1]
38- ; ISEL-NEXT: v_readlane_b32 s35, v40, 3
35+ ; ISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
3936; ISEL-NEXT: v_readlane_b32 s34, v40, 2
4037; ISEL-NEXT: v_readlane_b32 s31, v40, 1
4138; ISEL-NEXT: v_readlane_b32 s30, v40, 0
4239; ISEL-NEXT: s_mov_b32 s32, s33
43- ; ISEL-NEXT: v_readlane_b32 s4, v40, 4
40+ ; ISEL-NEXT: v_readlane_b32 s4, v40, 3
4441; ISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
4542; ISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
4643; ISEL-NEXT: s_mov_b64 exec, s[6:7]
4744; ISEL-NEXT: s_mov_b32 s33, s4
48- ; ISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
45+ ; ISEL-NEXT: s_waitcnt vmcnt(0)
4946; ISEL-NEXT: s_setpc_b64 s[30:31]
5047;
5148; GI-LABEL: static_alloca:
@@ -56,35 +53,27 @@ define i32 @static_alloca() {
5653; GI-NEXT: s_or_saveexec_b64 s[18:19], -1
5754; GI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
5855; GI-NEXT: s_mov_b64 exec, s[18:19]
59- ; GI-NEXT: v_writelane_b32 v40, s16, 4
60- ; GI-NEXT: v_writelane_b32 v40, s30, 0
61- ; GI-NEXT: v_writelane_b32 v40, s31, 1
56+ ; GI-NEXT: v_writelane_b32 v40, s16, 2
6257; GI-NEXT: s_addk_i32 s32, 0x400
63- ; GI-NEXT: v_writelane_b32 v40, s34, 2
64- ; GI-NEXT: s_lshr_b32 s34, s33, 6
6558; GI-NEXT: s_mov_b64 s[16:17], src_private_base
59+ ; GI-NEXT: v_writelane_b32 v40, s30, 0
6660; GI-NEXT: s_getpc_b64 s[18:19]
6761; GI-NEXT: s_add_u32 s18, s18, bar@rel32@lo+4
6862; GI-NEXT: s_addc_u32 s19, s19, bar@rel32@hi+12
6963; GI-NEXT: v_lshrrev_b32_e64 v0, 6, s33
7064; GI-NEXT: v_mov_b32_e32 v1, s17
71- ; GI-NEXT: v_writelane_b32 v40, s35, 3
72- ; GI-NEXT: s_mov_b32 s35, s17
65+ ; GI-NEXT: v_writelane_b32 v40, s31, 1
7366; GI-NEXT: s_swappc_b64 s[30:31], s[18:19]
74- ; GI-NEXT: v_mov_b32_e32 v0, s34
75- ; GI-NEXT: v_mov_b32_e32 v1, s35
76- ; GI-NEXT: flat_load_dword v0, v[0:1]
77- ; GI-NEXT: v_readlane_b32 s35, v40, 3
78- ; GI-NEXT: v_readlane_b32 s34, v40, 2
67+ ; GI-NEXT: buffer_load_dword v0, off, s[0:3], s33
7968; GI-NEXT: v_readlane_b32 s31, v40, 1
8069; GI-NEXT: v_readlane_b32 s30, v40, 0
8170; GI-NEXT: s_mov_b32 s32, s33
82- ; GI-NEXT: v_readlane_b32 s4, v40, 4
71+ ; GI-NEXT: v_readlane_b32 s4, v40, 2
8372; GI-NEXT: s_or_saveexec_b64 s[6:7], -1
8473; GI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
8574; GI-NEXT: s_mov_b64 exec, s[6:7]
8675; GI-NEXT: s_mov_b32 s33, s4
87- ; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
76+ ; GI-NEXT: s_waitcnt vmcnt(0)
8877; GI-NEXT: s_setpc_b64 s[30:31]
8978 %alloca = alloca i32 , align 4
9079 call void @bar (ptr %alloca )
@@ -112,19 +101,18 @@ define amdgpu_kernel void @static_alloca_kernel(ptr %p) {
112101; ISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
113102; ISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
114103; ISEL-NEXT: s_cselect_b32 s33, 0, 0
115- ; ISEL-NEXT: s_cselect_b32 s36 , s15, 0
104+ ; ISEL-NEXT: s_cselect_b32 s15 , s15, 0
116105; ISEL-NEXT: v_or3_b32 v31, v0, v1, v2
117106; ISEL-NEXT: s_mov_b32 s14, s16
118107; ISEL-NEXT: v_mov_b32_e32 v0, s33
119- ; ISEL-NEXT: v_mov_b32_e32 v1, s36
108+ ; ISEL-NEXT: v_mov_b32_e32 v1, s15
120109; ISEL-NEXT: s_movk_i32 s32, 0x400
121110; ISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
122111; ISEL-NEXT: v_mov_b32_e32 v0, s33
123- ; ISEL-NEXT: v_mov_b32_e32 v1, s36
124- ; ISEL-NEXT: flat_load_dword v2, v[0:1]
112+ ; ISEL-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
125113; ISEL-NEXT: v_mov_b32_e32 v0, s34
126114; ISEL-NEXT: v_mov_b32_e32 v1, s35
127- ; ISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
115+ ; ISEL-NEXT: s_waitcnt vmcnt(0)
128116; ISEL-NEXT: flat_store_dword v[0:1], v2
129117; ISEL-NEXT: s_endpgm
130118;
@@ -138,10 +126,10 @@ define amdgpu_kernel void @static_alloca_kernel(ptr %p) {
138126; GI-NEXT: s_add_u32 s8, s8, 8
139127; GI-NEXT: s_mov_b32 s13, s15
140128; GI-NEXT: s_mov_b32 s12, s14
129+ ; GI-NEXT: s_mov_b64 s[14:15], src_private_base
141130; GI-NEXT: s_addc_u32 s9, s9, 0
142131; GI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
143132; GI-NEXT: v_lshlrev_b32_e32 v2, 20, v2
144- ; GI-NEXT: s_mov_b64 s[14:15], src_private_base
145133; GI-NEXT: v_or3_b32 v31, v0, v1, v2
146134; GI-NEXT: s_getpc_b64 s[18:19]
147135; GI-NEXT: s_add_u32 s18, s18, bar@rel32@lo+4
@@ -150,15 +138,11 @@ define amdgpu_kernel void @static_alloca_kernel(ptr %p) {
150138; GI-NEXT: v_mov_b32_e32 v1, s15
151139; GI-NEXT: s_mov_b32 s14, s16
152140; GI-NEXT: s_movk_i32 s32, 0x400
153- ; GI-NEXT: s_mov_b32 s36, 0
154- ; GI-NEXT: s_mov_b32 s37, s15
155141; GI-NEXT: s_swappc_b64 s[30:31], s[18:19]
156- ; GI-NEXT: v_mov_b32_e32 v0, s36
157- ; GI-NEXT: v_mov_b32_e32 v1, s37
158- ; GI-NEXT: flat_load_dword v2, v[0:1]
142+ ; GI-NEXT: buffer_load_dword v2, off, s[0:3], 0
159143; GI-NEXT: v_mov_b32_e32 v0, s34
160144; GI-NEXT: v_mov_b32_e32 v1, s35
161- ; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
145+ ; GI-NEXT: s_waitcnt vmcnt(0)
162146; GI-NEXT: flat_store_dword v[0:1], v2
163147; GI-NEXT: s_endpgm
164148 %alloca = alloca i32 , align 4
@@ -279,24 +263,24 @@ define amdgpu_kernel void @dynamic_alloca_i32_kernel(i32 %n, ptr %p) {
279263; ISEL-LABEL: dynamic_alloca_i32_kernel:
280264; ISEL: ; %bb.0:
281265; ISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
266+ ; ISEL-NEXT: s_mov_b32 s12, s14
267+ ; ISEL-NEXT: s_load_dword s14, s[8:9], 0x0
268+ ; ISEL-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x8
282269; ISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
283270; ISEL-NEXT: s_add_u32 s0, s0, s17
284- ; ISEL-NEXT: s_load_dword s17, s[8:9], 0x0
285- ; ISEL-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x8
286- ; ISEL-NEXT: s_movk_i32 s32, 0x400
287271; ISEL-NEXT: s_addc_u32 s1, s1, 0
288- ; ISEL-NEXT: s_mov_b32 s13, s15
289- ; ISEL-NEXT: s_mov_b32 s12, s14
290- ; ISEL-NEXT: s_mov_b64 s[14:15], src_private_base
291- ; ISEL-NEXT: s_cmp_lg_u32 s32, -1
292- ; ISEL-NEXT: s_cselect_b32 s15, s15, 0
293- ; ISEL-NEXT: s_cselect_b32 s20, s32, 0
294272; ISEL-NEXT: s_waitcnt lgkmcnt(0)
295- ; ISEL-NEXT: s_lshl_b32 s14, s17 , 2
273+ ; ISEL-NEXT: s_lshl_b32 s14, s14 , 2
296274; ISEL-NEXT: s_add_i32 s14, s14, 15
297275; ISEL-NEXT: s_and_b32 s14, s14, -16
276+ ; ISEL-NEXT: s_movk_i32 s32, 0x400
298277; ISEL-NEXT: s_lshl_b32 s14, s14, 6
299- ; ISEL-NEXT: s_add_i32 s32, s32, s14
278+ ; ISEL-NEXT: s_add_i32 s17, s32, s14
279+ ; ISEL-NEXT: s_mov_b32 s13, s15
280+ ; ISEL-NEXT: s_cmp_lg_u32 s32, -1
281+ ; ISEL-NEXT: s_mov_b64 s[14:15], src_private_base
282+ ; ISEL-NEXT: s_cselect_b32 s36, s32, 0
283+ ; ISEL-NEXT: s_cselect_b32 s15, s15, 0
300284; ISEL-NEXT: s_add_u32 s8, s8, 16
301285; ISEL-NEXT: s_addc_u32 s9, s9, 0
302286; ISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
@@ -306,16 +290,16 @@ define amdgpu_kernel void @dynamic_alloca_i32_kernel(i32 %n, ptr %p) {
306290; ISEL-NEXT: s_addc_u32 s19, s19, bar@rel32@hi+12
307291; ISEL-NEXT: v_or3_b32 v31, v0, v1, v2
308292; ISEL-NEXT: s_mov_b32 s14, s16
309- ; ISEL-NEXT: v_mov_b32_e32 v0, s20
293+ ; ISEL-NEXT: v_mov_b32_e32 v0, s36
310294; ISEL-NEXT: v_mov_b32_e32 v1, s15
311295; ISEL-NEXT: s_mov_b32 s33, 0
312- ; ISEL-NEXT: v_mov_b32_e32 v40, s20
313- ; ISEL-NEXT: v_mov_b32_e32 v41, s15
296+ ; ISEL-NEXT: s_mov_b32 s32, s17
314297; ISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
315- ; ISEL-NEXT: flat_load_dword v2, v[40:41]
298+ ; ISEL-NEXT: v_mov_b32_e32 v0, s36
299+ ; ISEL-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
316300; ISEL-NEXT: v_mov_b32_e32 v0, s34
317301; ISEL-NEXT: v_mov_b32_e32 v1, s35
318- ; ISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
302+ ; ISEL-NEXT: s_waitcnt vmcnt(0)
319303; ISEL-NEXT: flat_store_dword v[0:1], v2
320304; ISEL-NEXT: s_endpgm
321305;
@@ -356,11 +340,10 @@ define amdgpu_kernel void @dynamic_alloca_i32_kernel(i32 %n, ptr %p) {
356340; GI-NEXT: s_mov_b32 s33, 0
357341; GI-NEXT: s_swappc_b64 s[30:31], s[18:19]
358342; GI-NEXT: v_mov_b32_e32 v0, s36
359- ; GI-NEXT: v_mov_b32_e32 v1, s37
360- ; GI-NEXT: flat_load_dword v2, v[0:1]
343+ ; GI-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
361344; GI-NEXT: v_mov_b32_e32 v0, s34
362345; GI-NEXT: v_mov_b32_e32 v1, s35
363- ; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
346+ ; GI-NEXT: s_waitcnt vmcnt(0)
364347; GI-NEXT: flat_store_dword v[0:1], v2
365348; GI-NEXT: s_endpgm
366349 %alloca = alloca i32 , i32 %n , align 4
@@ -478,24 +461,24 @@ define i32 @dynamic_alloca_i64(i64 %n) {
478461define amdgpu_kernel void @dynamic_alloca_i64_kernel (i64 %n , ptr %p ) {
479462; ISEL-LABEL: dynamic_alloca_i64_kernel:
480463; ISEL: ; %bb.0:
481- ; ISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
482464; ISEL-NEXT: s_load_dwordx4 s[20:23], s[8:9], 0x0
465+ ; ISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
483466; ISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
484467; ISEL-NEXT: s_add_u32 s0, s0, s17
485- ; ISEL-NEXT: s_movk_i32 s32, 0x400
486468; ISEL-NEXT: s_addc_u32 s1, s1, 0
487- ; ISEL-NEXT: s_mov_b32 s13, s15
488469; ISEL-NEXT: s_mov_b32 s12, s14
489- ; ISEL-NEXT: s_mov_b64 s[14:15], src_private_base
490- ; ISEL-NEXT: s_cmp_lg_u32 s32, -1
491- ; ISEL-NEXT: s_cselect_b32 s15, s15, 0
492- ; ISEL-NEXT: s_cselect_b32 s17, s32, 0
493470; ISEL-NEXT: s_waitcnt lgkmcnt(0)
494471; ISEL-NEXT: s_lshl_b32 s14, s20, 2
495472; ISEL-NEXT: s_add_i32 s14, s14, 15
496473; ISEL-NEXT: s_and_b32 s14, s14, -16
474+ ; ISEL-NEXT: s_movk_i32 s32, 0x400
497475; ISEL-NEXT: s_lshl_b32 s14, s14, 6
498- ; ISEL-NEXT: s_add_i32 s32, s32, s14
476+ ; ISEL-NEXT: s_add_i32 s17, s32, s14
477+ ; ISEL-NEXT: s_mov_b32 s13, s15
478+ ; ISEL-NEXT: s_cmp_lg_u32 s32, -1
479+ ; ISEL-NEXT: s_mov_b64 s[14:15], src_private_base
480+ ; ISEL-NEXT: s_cselect_b32 s34, s32, 0
481+ ; ISEL-NEXT: s_cselect_b32 s15, s15, 0
499482; ISEL-NEXT: s_add_u32 s8, s8, 16
500483; ISEL-NEXT: s_addc_u32 s9, s9, 0
501484; ISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
@@ -505,16 +488,16 @@ define amdgpu_kernel void @dynamic_alloca_i64_kernel(i64 %n, ptr %p) {
505488; ISEL-NEXT: s_addc_u32 s19, s19, bar@rel32@hi+12
506489; ISEL-NEXT: v_or3_b32 v31, v0, v1, v2
507490; ISEL-NEXT: s_mov_b32 s14, s16
508- ; ISEL-NEXT: v_mov_b32_e32 v0, s17
491+ ; ISEL-NEXT: v_mov_b32_e32 v0, s34
509492; ISEL-NEXT: v_mov_b32_e32 v1, s15
510493; ISEL-NEXT: s_mov_b32 s33, 0
511494; ISEL-NEXT: v_mov_b32_e32 v40, s22
512495; ISEL-NEXT: v_mov_b32_e32 v41, s23
513- ; ISEL-NEXT: v_mov_b32_e32 v42, s17
514- ; ISEL-NEXT: v_mov_b32_e32 v43, s15
496+ ; ISEL-NEXT: s_mov_b32 s32, s17
515497; ISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
516- ; ISEL-NEXT: flat_load_dword v0, v[42:43]
517- ; ISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
498+ ; ISEL-NEXT: v_mov_b32_e32 v0, s34
499+ ; ISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
500+ ; ISEL-NEXT: s_waitcnt vmcnt(0)
518501; ISEL-NEXT: flat_store_dword v[40:41], v0
519502; ISEL-NEXT: s_endpgm
520503;
@@ -553,11 +536,10 @@ define amdgpu_kernel void @dynamic_alloca_i64_kernel(i64 %n, ptr %p) {
553536; GI-NEXT: s_mov_b32 s33, 0
554537; GI-NEXT: s_swappc_b64 s[30:31], s[18:19]
555538; GI-NEXT: v_mov_b32_e32 v0, s34
556- ; GI-NEXT: v_mov_b32_e32 v1, s35
557- ; GI-NEXT: flat_load_dword v2, v[0:1]
539+ ; GI-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
558540; GI-NEXT: v_mov_b32_e32 v0, s38
559541; GI-NEXT: v_mov_b32_e32 v1, s39
560- ; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
542+ ; GI-NEXT: s_waitcnt vmcnt(0)
561543; GI-NEXT: flat_store_dword v[0:1], v2
562544; GI-NEXT: s_endpgm
563545 %alloca = alloca i32 , i64 %n , align 4
0 commit comments