Skip to content

Commit 8c75501

Browse files
[AMDGPU] Unused sdst writing to null (#133229)
Write an unused sdst to null to avoid a false VALU->SALU dependency stall. This requires using the VOP3 encoding.
1 parent 049f179 commit 8c75501

File tree

68 files changed

+2385
-2257
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

68 files changed

+2385
-2257
lines changed

llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -973,11 +973,13 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
973973
continue;
974974
}
975975

976-
if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
977-
// If there is no chance we will shrink it and use VCC as sdst to get
978-
// a 32 bit form try to replace dead sdst with NULL.
976+
// If there is no chance we will shrink it and use VCC as sdst to get
977+
// a 32 bit form try to replace dead sdst with NULL.
978+
if (TII->isVOP3(MI.getOpcode())) {
979979
tryReplaceDeadSDST(MI);
980-
continue;
980+
if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
981+
continue;
982+
}
981983
}
982984

983985
if (!TII->canShrink(MI, *MRI)) {

llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ define amdgpu_ps void @v_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
3939
; GCN-LABEL: v_add_u64:
4040
; GCN: ; %bb.0: ; %entry
4141
; GCN-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
42-
; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
42+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
43+
; GCN-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo
4344
; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
4445
; GCN-NEXT: s_endpgm
4546
entry:
@@ -85,7 +86,8 @@ define amdgpu_ps void @v_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
8586
; GCN-LABEL: v_sub_u64:
8687
; GCN: ; %bb.0: ; %entry
8788
; GCN-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v4
88-
; GCN-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
89+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
90+
; GCN-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v5, vcc_lo
8991
; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
9092
; GCN-NEXT: s_endpgm
9193
entry:

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1440,16 +1440,16 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
14401440
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
14411441
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
14421442
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
1443-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1444-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1443+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1444+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
14451445
; GFX11-NEXT: flat_atomic_dec_u32 v3, v[0:1], v3 offset:20 glc
14461446
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14471447
; GFX11-NEXT: buffer_gl1_inv
14481448
; GFX11-NEXT: buffer_gl0_inv
14491449
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1450-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1450+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14511451
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
1452-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1452+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
14531453
; GFX11-NEXT: flat_store_b32 v[0:1], v3
14541454
; GFX11-NEXT: s_endpgm
14551455
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1539,8 +1539,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
15391539
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
15401540
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
15411541
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
1542-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1543-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1542+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1543+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
15441544
; GFX11-NEXT: v_mov_b32_e32 v2, 42
15451545
; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:20
15461546
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -2090,16 +2090,16 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
20902090
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
20912091
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
20922092
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2093-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2094-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2093+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2094+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
20952095
; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:40 glc
20962096
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20972097
; GFX11-NEXT: buffer_gl1_inv
20982098
; GFX11-NEXT: buffer_gl0_inv
20992099
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
2100-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
2100+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
21012101
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2102-
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2102+
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
21032103
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
21042104
; GFX11-NEXT: s_endpgm
21052105
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2194,8 +2194,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
21942194
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
21952195
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
21962196
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
2197-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2198-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2197+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2198+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
21992199
; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:40
22002200
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
22012201
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2824,16 +2824,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
28242824
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
28252825
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
28262826
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
2827-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2828-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2827+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2828+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
28292829
; GFX11-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 glc
28302830
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
28312831
; GFX11-NEXT: buffer_gl1_inv
28322832
; GFX11-NEXT: buffer_gl0_inv
28332833
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2834-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
2834+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
28352835
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
2836-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2836+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
28372837
; GFX11-NEXT: flat_store_b32 v[0:1], v3
28382838
; GFX11-NEXT: s_endpgm
28392839
;
@@ -2846,15 +2846,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
28462846
; GFX12-NEXT: s_wait_kmcnt 0x0
28472847
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
28482848
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
2849-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2850-
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2849+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2850+
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
28512851
; GFX12-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
28522852
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
28532853
; GFX12-NEXT: global_inv scope:SCOPE_DEV
28542854
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
28552855
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
28562856
; GFX12-NEXT: s_wait_alu 0xfffd
2857-
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2857+
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
28582858
; GFX12-NEXT: flat_store_b32 v[0:1], v3
28592859
; GFX12-NEXT: s_endpgm
28602860
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2944,8 +2944,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
29442944
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
29452945
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
29462946
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
2947-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2948-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2947+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2948+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
29492949
; GFX11-NEXT: v_mov_b32_e32 v2, 42
29502950
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20
29512951
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -2963,8 +2963,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
29632963
; GFX12-NEXT: s_wait_kmcnt 0x0
29642964
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
29652965
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
2966-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
2967-
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2966+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2967+
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
29682968
; GFX12-NEXT: v_mov_b32_e32 v2, 42
29692969
; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 scope:SCOPE_DEV
29702970
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -3810,16 +3810,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
38103810
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
38113811
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
38123812
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
3813-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
3814-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
3813+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3814+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
38153815
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 glc
38163816
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
38173817
; GFX11-NEXT: buffer_gl1_inv
38183818
; GFX11-NEXT: buffer_gl0_inv
38193819
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
3820-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
3820+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
38213821
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
3822-
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
3822+
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
38233823
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
38243824
; GFX11-NEXT: s_endpgm
38253825
;
@@ -3833,15 +3833,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
38333833
; GFX12-NEXT: s_wait_kmcnt 0x0
38343834
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
38353835
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
3836-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
3837-
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
3836+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
3837+
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
38383838
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
38393839
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
38403840
; GFX12-NEXT: global_inv scope:SCOPE_DEV
38413841
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
38423842
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
38433843
; GFX12-NEXT: s_wait_alu 0xfffd
3844-
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
3844+
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
38453845
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
38463846
; GFX12-NEXT: s_endpgm
38473847
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -3936,8 +3936,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
39363936
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
39373937
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
39383938
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
3939-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
3940-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
3939+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3940+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
39413941
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40
39423942
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
39433943
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3955,8 +3955,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
39553955
; GFX12-NEXT: s_wait_kmcnt 0x0
39563956
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
39573957
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
3958-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
3959-
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
3958+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3959+
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
39603960
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 scope:SCOPE_DEV
39613961
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
39623962
; GFX12-NEXT: global_inv scope:SCOPE_DEV

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,9 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
2929
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2
3030
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
3131
; GFX12-NEXT: s_wait_alu 0xfffd
32-
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
32+
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
3333
; GFX12-NEXT: global_load_b32 v0, v[0:1], off
3434
; GFX12-NEXT: s_wait_loadcnt 0x0
35-
; GFX12-NEXT: s_wait_alu 0xfffd
3635
; GFX12-NEXT: s_setpc_b64 s[30:31]
3736
%vec = load <64 x i32>, ptr addrspace(1) %ptr
3837
%elt = extractelement <64 x i32> %vec, i32 %idx
@@ -63,10 +62,9 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
6362
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 1, v2
6463
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
6564
; GFX12-NEXT: s_wait_alu 0xfffd
66-
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
65+
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
6766
; GFX12-NEXT: global_load_u16 v0, v[0:1], off
6867
; GFX12-NEXT: s_wait_loadcnt 0x0
69-
; GFX12-NEXT: s_wait_alu 0xfffd
7068
; GFX12-NEXT: s_setpc_b64 s[30:31]
7169
%vec = load <128 x i16>, ptr addrspace(1) %ptr
7270
%elt = extractelement <128 x i16> %vec, i32 %idx
@@ -97,10 +95,9 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
9795
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v2
9896
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
9997
; GFX12-NEXT: s_wait_alu 0xfffd
100-
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
98+
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
10199
; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off
102100
; GFX12-NEXT: s_wait_loadcnt 0x0
103-
; GFX12-NEXT: s_wait_alu 0xfffd
104101
; GFX12-NEXT: s_setpc_b64 s[30:31]
105102
%vec = load <32 x i64>, ptr addrspace(1) %ptr
106103
%elt = extractelement <32 x i64> %vec, i32 %idx

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,8 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr
126126
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
127127
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
128128
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
129-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
130-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
129+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
130+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
131131
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
132132
; GFX11-NEXT: s_waitcnt vmcnt(0)
133133
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -195,7 +195,8 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
195195
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
196196
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2
197197
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
198-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
198+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
199+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
199200
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
200201
; GFX11-NEXT: s_waitcnt vmcnt(0)
201202
; GFX11-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,8 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr,
133133
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
134134
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
135135
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
136-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
137-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
136+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
137+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
138138
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
139139
; GFX11-NEXT: s_waitcnt vmcnt(0)
140140
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -199,7 +199,8 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
199199
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
200200
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2
201201
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
202-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
202+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
203+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
203204
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
204205
; GFX11-NEXT: s_waitcnt vmcnt(0)
205206
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -776,8 +777,8 @@ define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr,
776777
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
777778
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
778779
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
779-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
780-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
780+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
781+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
781782
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
782783
; GFX11-NEXT: s_waitcnt vmcnt(0)
783784
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -842,7 +843,8 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
842843
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
843844
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2
844845
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
845-
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
846+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
847+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
846848
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
847849
; GFX11-NEXT: s_waitcnt vmcnt(0)
848850
; GFX11-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)