@@ -257,20 +257,16 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
257
257
; GFX12: ; %bb.0: ; %bb
258
258
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
259
259
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
260
- ; GFX12-NEXT: v_mov_b32_e32 v2, 15
261
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
260
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
262
261
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
262
+ ; GFX12-NEXT: v_mov_b32_e32 v2, 15
263
263
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
264
264
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
265
265
; GFX12-NEXT: s_wait_kmcnt 0x0
266
266
; GFX12-NEXT: s_lshl_b32 s0, s0, 7
267
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
268
- ; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
269
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
270
- ; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
271
- ; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS
267
+ ; GFX12-NEXT: scratch_store_b32 v0, v2, s0 scope:SCOPE_SYS
272
268
; GFX12-NEXT: s_wait_storecnt 0x0
273
- ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
269
+ ; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
274
270
; GFX12-NEXT: s_wait_loadcnt 0x0
275
271
; GFX12-NEXT: s_endpgm
276
272
;
@@ -357,20 +353,16 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
357
353
; UNALIGNED_GFX12: ; %bb.0: ; %bb
358
354
; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
359
355
; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
360
- ; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
361
- ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
356
+ ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
362
357
; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
358
+ ; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
363
359
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
364
360
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
365
361
; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
366
362
; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
367
- ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
368
- ; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
369
- ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
370
- ; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
371
- ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS
363
+ ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, s0 scope:SCOPE_SYS
372
364
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
373
- ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
365
+ ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
374
366
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
375
367
; UNALIGNED_GFX12-NEXT: s_endpgm
376
368
bb:
@@ -937,19 +929,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
937
929
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
938
930
; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
939
931
; GFX12-NEXT: s_wait_loadcnt 0x0
940
- ; GFX12-NEXT: v_mov_b32_e32 v2, 15
941
932
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
933
+ ; GFX12-NEXT: v_mov_b32_e32 v2, 15
942
934
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
943
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
935
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
944
936
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
945
937
; GFX12-NEXT: s_wait_kmcnt 0x0
946
938
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS
947
939
; GFX12-NEXT: s_wait_storecnt 0x0
948
940
; GFX12-NEXT: s_lshl_b32 s0, s0, 7
949
941
; GFX12-NEXT: s_add_co_u32 s0, 0x100, s0
950
- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
951
- ; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
952
- ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
942
+ ; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
953
943
; GFX12-NEXT: s_wait_loadcnt 0x0
954
944
; GFX12-NEXT: s_endpgm
955
945
;
@@ -1048,19 +1038,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
1048
1038
; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1049
1039
; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
1050
1040
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
1051
- ; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
1052
1041
; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
1042
+ ; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
1053
1043
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1054
- ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
1044
+ ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
1055
1045
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
1056
1046
; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
1057
1047
; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS
1058
1048
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
1059
1049
; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
1060
1050
; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x100, s0
1061
- ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1062
- ; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
1063
- ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
1051
+ ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
1064
1052
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
1065
1053
; UNALIGNED_GFX12-NEXT: s_endpgm
1066
1054
bb:
@@ -1579,19 +1567,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
1579
1567
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1580
1568
; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
1581
1569
; GFX12-NEXT: s_wait_loadcnt 0x0
1582
- ; GFX12-NEXT: v_mov_b32_e32 v2, 15
1583
1570
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
1571
+ ; GFX12-NEXT: v_mov_b32_e32 v2, 15
1584
1572
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1585
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
1573
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
1586
1574
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
1587
1575
; GFX12-NEXT: s_wait_kmcnt 0x0
1588
1576
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS
1589
1577
; GFX12-NEXT: s_wait_storecnt 0x0
1590
1578
; GFX12-NEXT: s_lshl_b32 s0, s0, 7
1591
1579
; GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0
1592
- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1593
- ; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
1594
- ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
1580
+ ; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
1595
1581
; GFX12-NEXT: s_wait_loadcnt 0x0
1596
1582
; GFX12-NEXT: s_endpgm
1597
1583
;
@@ -1692,19 +1678,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
1692
1678
; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1693
1679
; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
1694
1680
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
1695
- ; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
1696
1681
; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
1682
+ ; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
1697
1683
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1698
- ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
1684
+ ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
1699
1685
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
1700
1686
; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
1701
1687
; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS
1702
1688
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
1703
1689
; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
1704
1690
; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0
1705
- ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1706
- ; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
1707
- ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
1691
+ ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
1708
1692
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
1709
1693
; UNALIGNED_GFX12-NEXT: s_endpgm
1710
1694
bb:
@@ -4060,9 +4044,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a
4060
4044
; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
4061
4045
; GFX12: ; %bb.0: ; %bb
4062
4046
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
4063
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
4064
- ; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
4065
- ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
4047
+ ; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS
4066
4048
; GFX12-NEXT: s_wait_storecnt 0x0
4067
4049
; GFX12-NEXT: s_endpgm
4068
4050
;
@@ -4113,9 +4095,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a
4113
4095
; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
4114
4096
; UNALIGNED_GFX12: ; %bb.0: ; %bb
4115
4097
; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
4116
- ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
4117
- ; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
4118
- ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
4098
+ ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS
4119
4099
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
4120
4100
; UNALIGNED_GFX12-NEXT: s_endpgm
4121
4101
bb:
@@ -4172,9 +4152,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt
4172
4152
; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
4173
4153
; GFX12: ; %bb.0: ; %bb
4174
4154
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
4175
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
4176
- ; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
4177
- ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
4155
+ ; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
4178
4156
; GFX12-NEXT: s_wait_storecnt 0x0
4179
4157
; GFX12-NEXT: s_endpgm
4180
4158
;
@@ -4223,9 +4201,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt
4223
4201
; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
4224
4202
; UNALIGNED_GFX12: ; %bb.0: ; %bb
4225
4203
; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
4226
- ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
4227
- ; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
4228
- ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
4204
+ ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
4229
4205
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
4230
4206
; UNALIGNED_GFX12-NEXT: s_endpgm
4231
4207
bb:
0 commit comments