@@ -55,41 +55,40 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
5555; GFX940-NEXT: s_lshl_b32 s0, s0, 2
5656; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
5757; GFX940-NEXT: s_waitcnt vmcnt(0)
58- ; GFX940-NEXT: v_mov_b32_e32 v0 , s0
59- ; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
58+ ; GFX940-NEXT: s_add_i32 s0 , s0, 0
59+ ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
6060; GFX940-NEXT: s_waitcnt vmcnt(0)
6161; GFX940-NEXT: s_endpgm
6262;
6363; GFX11-LABEL: store_load_sindex_kernel:
6464; GFX11: ; %bb.0: ; %bb
6565; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
66+ ; GFX11-NEXT: v_mov_b32_e32 v0, 15
6667; GFX11-NEXT: s_waitcnt lgkmcnt(0)
6768; GFX11-NEXT: s_and_b32 s1, s0, 15
6869; GFX11-NEXT: s_lshl_b32 s0, s0, 2
6970; GFX11-NEXT: s_lshl_b32 s1, s1, 2
70- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
71- ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
7271; GFX11-NEXT: s_add_i32 s0, s0, 0
72+ ; GFX11-NEXT: s_add_i32 s1, s1, 0
7373; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
7474; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
75- ; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
75+ ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
7676; GFX11-NEXT: s_waitcnt vmcnt(0)
7777; GFX11-NEXT: s_endpgm
7878;
7979; GFX12-LABEL: store_load_sindex_kernel:
8080; GFX12: ; %bb.0: ; %bb
8181; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
82- ; GFX12-NEXT: v_mov_b32_e32 v1 , 15
82+ ; GFX12-NEXT: v_mov_b32_e32 v0 , 15
8383; GFX12-NEXT: s_wait_kmcnt 0x0
84- ; GFX12-NEXT: s_lshl_b32 s1, s0, 2
85- ; GFX12-NEXT: s_and_b32 s0, s0, 15
86- ; GFX12-NEXT: v_mov_b32_e32 v0, s1
84+ ; GFX12-NEXT: s_and_b32 s1, s0, 15
8785; GFX12-NEXT: s_lshl_b32 s0, s0, 2
88- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
89- ; GFX12-NEXT: v_mov_b32_e32 v2, s0
90- ; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS
86+ ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
87+ ; GFX12-NEXT: s_add_co_i32 s0, s0, 0
88+ ; GFX12-NEXT: s_add_co_i32 s1, s1, 0
89+ ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
9190; GFX12-NEXT: s_wait_storecnt 0x0
92- ; GFX12-NEXT: scratch_load_b32 v0, v2, off scope:SCOPE_SYS
91+ ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
9392; GFX12-NEXT: s_wait_loadcnt 0x0
9493; GFX12-NEXT: s_endpgm
9594bb:
@@ -378,44 +377,44 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
378377; GFX940-NEXT: s_lshl_b32 s0, s0, 2
379378; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
380379; GFX940-NEXT: s_waitcnt vmcnt(0)
381- ; GFX940-NEXT: v_mov_b32_e32 v0, s0
382- ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:256 sc0 sc1
380+ ; GFX940-NEXT: s_addk_i32 s0, 0x100
381+ ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
383382; GFX940-NEXT: s_waitcnt vmcnt(0)
384383; GFX940-NEXT: s_endpgm
385384;
386385; GFX11-LABEL: store_load_sindex_small_offset_kernel:
387386; GFX11: ; %bb.0: ; %bb
388387; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
389- ; GFX11-NEXT: scratch_load_b32 v2, off, off glc dlc
390- ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
388+ ; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc
389+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
390+ ; GFX11-NEXT: v_mov_b32_e32 v0, 15
391+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
391392; GFX11-NEXT: s_and_b32 s1, s0, 15
392393; GFX11-NEXT: s_lshl_b32 s0, s0, 2
393394; GFX11-NEXT: s_lshl_b32 s1, s1, 2
394- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
395- ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
396395; GFX11-NEXT: s_addk_i32 s0, 0x100
396+ ; GFX11-NEXT: s_addk_i32 s1, 0x100
397397; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
398398; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
399- ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:256 glc dlc
399+ ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
400400; GFX11-NEXT: s_waitcnt vmcnt(0)
401401; GFX11-NEXT: s_endpgm
402402;
403403; GFX12-LABEL: store_load_sindex_small_offset_kernel:
404404; GFX12: ; %bb.0: ; %bb
405405; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
406- ; GFX12-NEXT: scratch_load_b32 v3 , off, off scope:SCOPE_SYS
406+ ; GFX12-NEXT: scratch_load_b32 v0 , off, off scope:SCOPE_SYS
407407; GFX12-NEXT: s_wait_loadcnt 0x0
408- ; GFX12-NEXT: v_mov_b32_e32 v1 , 15
408+ ; GFX12-NEXT: v_mov_b32_e32 v0 , 15
409409; GFX12-NEXT: s_wait_kmcnt 0x0
410- ; GFX12-NEXT: s_lshl_b32 s1, s0, 2
411- ; GFX12-NEXT: s_and_b32 s0, s0, 15
412- ; GFX12-NEXT: v_mov_b32_e32 v0, s1
410+ ; GFX12-NEXT: s_and_b32 s1, s0, 15
413411; GFX12-NEXT: s_lshl_b32 s0, s0, 2
414- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
415- ; GFX12-NEXT: v_mov_b32_e32 v2, s0
416- ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
412+ ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
413+ ; GFX12-NEXT: s_addk_co_i32 s0, 0x100
414+ ; GFX12-NEXT: s_addk_co_i32 s1, 0x100
415+ ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
417416; GFX12-NEXT: s_wait_storecnt 0x0
418- ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:256 scope:SCOPE_SYS
417+ ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
419418; GFX12-NEXT: s_wait_loadcnt 0x0
420419; GFX12-NEXT: s_endpgm
421420bb:
@@ -692,46 +691,44 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
692691; GFX940-NEXT: s_lshl_b32 s0, s0, 2
693692; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
694693; GFX940-NEXT: s_waitcnt vmcnt(0)
695- ; GFX940-NEXT: v_mov_b32_e32 v0, s0
696- ; GFX940-NEXT: s_movk_i32 s0, 0x4004
697- ; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1
694+ ; GFX940-NEXT: s_addk_i32 s0, 0x4004
695+ ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
698696; GFX940-NEXT: s_waitcnt vmcnt(0)
699697; GFX940-NEXT: s_endpgm
700698;
701699; GFX11-LABEL: store_load_sindex_large_offset_kernel:
702700; GFX11: ; %bb.0: ; %bb
703701; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
704- ; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc
705- ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
702+ ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
703+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
704+ ; GFX11-NEXT: v_mov_b32_e32 v0, 15
705+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
706706; GFX11-NEXT: s_and_b32 s1, s0, 15
707707; GFX11-NEXT: s_lshl_b32 s0, s0, 2
708708; GFX11-NEXT: s_lshl_b32 s1, s1, 2
709- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
710- ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
711709; GFX11-NEXT: s_addk_i32 s0, 0x4004
710+ ; GFX11-NEXT: s_addk_i32 s1, 0x4004
712711; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
713712; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
714- ; GFX11-NEXT: s_movk_i32 s0, 0x4004
715- ; GFX11-NEXT: scratch_load_b32 v0, v1, s0 glc dlc
713+ ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
716714; GFX11-NEXT: s_waitcnt vmcnt(0)
717715; GFX11-NEXT: s_endpgm
718716;
719717; GFX12-LABEL: store_load_sindex_large_offset_kernel:
720718; GFX12: ; %bb.0: ; %bb
721719; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
722- ; GFX12-NEXT: scratch_load_b32 v3 , off, off scope:SCOPE_SYS
720+ ; GFX12-NEXT: scratch_load_b32 v0 , off, off scope:SCOPE_SYS
723721; GFX12-NEXT: s_wait_loadcnt 0x0
724- ; GFX12-NEXT: v_mov_b32_e32 v1 , 15
722+ ; GFX12-NEXT: v_mov_b32_e32 v0 , 15
725723; GFX12-NEXT: s_wait_kmcnt 0x0
726- ; GFX12-NEXT: s_lshl_b32 s1, s0, 2
727- ; GFX12-NEXT: s_and_b32 s0, s0, 15
728- ; GFX12-NEXT: v_mov_b32_e32 v0, s1
724+ ; GFX12-NEXT: s_and_b32 s1, s0, 15
729725; GFX12-NEXT: s_lshl_b32 s0, s0, 2
730- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
731- ; GFX12-NEXT: v_mov_b32_e32 v2, s0
732- ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
726+ ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
727+ ; GFX12-NEXT: s_addk_co_i32 s0, 0x4000
728+ ; GFX12-NEXT: s_addk_co_i32 s1, 0x4000
729+ ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
733730; GFX12-NEXT: s_wait_storecnt 0x0
734- ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16384 scope:SCOPE_SYS
731+ ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
735732; GFX12-NEXT: s_wait_loadcnt 0x0
736733; GFX12-NEXT: s_endpgm
737734bb:
@@ -995,25 +992,28 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
995992; GFX940-LABEL: store_load_large_imm_offset_kernel:
996993; GFX940: ; %bb.0: ; %bb
997994; GFX940-NEXT: v_mov_b32_e32 v0, 13
995+ ; GFX940-NEXT: s_movk_i32 s0, 0x3e80
998996; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
999997; GFX940-NEXT: s_waitcnt vmcnt(0)
1000- ; GFX940-NEXT: v_mov_b32_e32 v0, 0x3e80
1001- ; GFX940-NEXT: v_mov_b32_e32 v1, 15
1002- ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1
998+ ; GFX940-NEXT: v_mov_b32_e32 v0, 15
999+ ; GFX940-NEXT: s_add_i32 s0, s0, 4
1000+ ; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
10031001; GFX940-NEXT: s_waitcnt vmcnt(0)
1004- ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1
1002+ ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
10051003; GFX940-NEXT: s_waitcnt vmcnt(0)
10061004; GFX940-NEXT: s_endpgm
10071005;
10081006; GFX11-LABEL: store_load_large_imm_offset_kernel:
10091007; GFX11: ; %bb.0: ; %bb
1010- ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
1011- ; GFX11-NEXT: v_mov_b32_e32 v2, 15
1008+ ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
1009+ ; GFX11-NEXT: s_movk_i32 s0, 0x3e80
1010+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1011+ ; GFX11-NEXT: s_add_i32 s0, s0, 4
10121012; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
10131013; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1014- ; GFX11-NEXT: scratch_store_b32 v1, v2, off offset:4 dlc
1014+ ; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
10151015; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1016- ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc
1016+ ; GFX11-NEXT: scratch_load_b32 v0, off, s0 glc dlc
10171017; GFX11-NEXT: s_waitcnt vmcnt(0)
10181018; GFX11-NEXT: s_endpgm
10191019;
@@ -1075,26 +1075,31 @@ define void @store_load_large_imm_offset_foo() {
10751075; GFX940: ; %bb.0: ; %bb
10761076; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10771077; GFX940-NEXT: v_mov_b32_e32 v0, 13
1078+ ; GFX940-NEXT: s_movk_i32 s0, 0x3e80
1079+ ; GFX940-NEXT: s_add_i32 s1, s32, 4
10781080; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
10791081; GFX940-NEXT: s_waitcnt vmcnt(0)
1080- ; GFX940-NEXT: v_mov_b32_e32 v0, 0x3e80
1081- ; GFX940-NEXT: v_mov_b32_e32 v1, 15
1082- ; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:4 sc0 sc1
1082+ ; GFX940-NEXT: v_mov_b32_e32 v0, 15
1083+ ; GFX940-NEXT: s_add_i32 s0, s0, s1
1084+ ; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
10831085; GFX940-NEXT: s_waitcnt vmcnt(0)
1084- ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:4 sc0 sc1
1086+ ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
10851087; GFX940-NEXT: s_waitcnt vmcnt(0)
10861088; GFX940-NEXT: s_setpc_b64 s[30:31]
10871089;
10881090; GFX11-LABEL: store_load_large_imm_offset_foo:
10891091; GFX11: ; %bb.0: ; %bb
10901092; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1091- ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
1092- ; GFX11-NEXT: v_mov_b32_e32 v2, 15
1093+ ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
1094+ ; GFX11-NEXT: s_movk_i32 s0, 0x3e80
1095+ ; GFX11-NEXT: s_add_i32 s1, s32, 4
1096+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1097+ ; GFX11-NEXT: s_add_i32 s0, s0, s1
10931098; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
10941099; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1095- ; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:4 dlc
1100+ ; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
10961101; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1097- ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:4 glc dlc
1102+ ; GFX11-NEXT: scratch_load_b32 v0, off, s0 glc dlc
10981103; GFX11-NEXT: s_waitcnt vmcnt(0)
10991104; GFX11-NEXT: s_setpc_b64 s[30:31]
11001105;
0 commit comments