Skip to content

Commit 3277c7c

Browse files
authored
[AMDGPU] Skip VGPR deallocation for waveslot limited kernels (#112765)
MSG_DEALLOC_VGPRS slows down very small waveslot-limited kernels. It has been identified that this message is only really needed for VGPR-limited kernels. A kernel is VGPR limited if the total number of VGPRs per SIMD divided by the number of VGPRs it uses is less than the number of wave slots.
1 parent 006fb09 commit 3277c7c

File tree

257 files changed

+335
-9655
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

257 files changed

+335
-9655
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2606,15 +2606,24 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
26062606

26072607
// Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
26082608
// instructions.
2609-
for (MachineInstr *MI : ReleaseVGPRInsts) {
2610-
if (ST->requiresNopBeforeDeallocVGPRs()) {
2611-
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP))
2612-
.addImm(0);
2609+
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
2610+
// waveslot limited kernel runs slower with the deallocation.
2611+
if (!ReleaseVGPRInsts.empty() &&
2612+
(MF.getFrameInfo().hasCalls() ||
2613+
ST->getOccupancyWithNumVGPRs(
2614+
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
2615+
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
2616+
for (MachineInstr *MI : ReleaseVGPRInsts) {
2617+
if (ST->requiresNopBeforeDeallocVGPRs()) {
2618+
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2619+
TII->get(AMDGPU::S_NOP))
2620+
.addImm(0);
2621+
}
2622+
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2623+
TII->get(AMDGPU::S_SENDMSG))
2624+
.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2625+
Modified = true;
26132626
}
2614-
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2615-
TII->get(AMDGPU::S_SENDMSG))
2616-
.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2617-
Modified = true;
26182627
}
26192628
ReleaseVGPRInsts.clear();
26202629
PreheadersToFlush.clear();

llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
1515
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1616
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1717
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
18-
; GFX11-NEXT: s_nop 0
19-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2018
; GFX11-NEXT: s_endpgm
2119
;
2220
; GFX12-LABEL: s_add_u64:
@@ -30,8 +28,6 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
3028
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3129
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
3230
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
33-
; GFX12-NEXT: s_nop 0
34-
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3531
; GFX12-NEXT: s_endpgm
3632
entry:
3733
%add = add i64 %a, %b
@@ -45,8 +41,6 @@ define amdgpu_ps void @v_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
4541
; GCN-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
4642
; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
4743
; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
48-
; GCN-NEXT: s_nop 0
49-
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
5044
; GCN-NEXT: s_endpgm
5145
entry:
5246
%add = add i64 %a, %b
@@ -67,8 +61,6 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
6761
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
6862
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
6963
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
70-
; GFX11-NEXT: s_nop 0
71-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
7264
; GFX11-NEXT: s_endpgm
7365
;
7466
; GFX12-LABEL: s_sub_u64:
@@ -82,8 +74,6 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
8274
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
8375
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
8476
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
85-
; GFX12-NEXT: s_nop 0
86-
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
8777
; GFX12-NEXT: s_endpgm
8878
entry:
8979
%sub = sub i64 %a, %b
@@ -97,8 +87,6 @@ define amdgpu_ps void @v_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
9787
; GCN-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v4
9888
; GCN-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
9989
; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
100-
; GCN-NEXT: s_nop 0
101-
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10290
; GCN-NEXT: s_endpgm
10391
entry:
10492
%sub = sub i64 %a, %b

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
8484
; GFX11-NEXT: buffer_gl0_inv
8585
; GFX11-NEXT: v_mov_b32_e32 v1, 0
8686
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
87-
; GFX11-NEXT: s_nop 0
88-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
8987
; GFX11-NEXT: s_endpgm
9088
%result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
9189
store i32 %result, ptr addrspace(1) %out, align 4
@@ -163,8 +161,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
163161
; GFX11-NEXT: buffer_gl0_inv
164162
; GFX11-NEXT: v_mov_b32_e32 v1, 0
165163
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
166-
; GFX11-NEXT: s_nop 0
167-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
168164
; GFX11-NEXT: s_endpgm
169165
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
170166
%result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -353,8 +349,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
353349
; GFX11-NEXT: buffer_gl1_inv
354350
; GFX11-NEXT: buffer_gl0_inv
355351
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
356-
; GFX11-NEXT: s_nop 0
357-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
358352
; GFX11-NEXT: s_endpgm
359353
%result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
360354
store i32 %result, ptr addrspace(1) %out, align 4
@@ -431,8 +425,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
431425
; GFX11-NEXT: buffer_gl1_inv
432426
; GFX11-NEXT: buffer_gl0_inv
433427
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
434-
; GFX11-NEXT: s_nop 0
435-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
436428
; GFX11-NEXT: s_endpgm
437429
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
438430
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -510,8 +502,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
510502
; GFX11-NEXT: buffer_gl1_inv
511503
; GFX11-NEXT: buffer_gl0_inv
512504
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
513-
; GFX11-NEXT: s_nop 0
514-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
515505
; GFX11-NEXT: s_endpgm
516506
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
517507
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
@@ -797,8 +787,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
797787
; GFX11-NEXT: buffer_gl1_inv
798788
; GFX11-NEXT: buffer_gl0_inv
799789
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
800-
; GFX11-NEXT: s_nop 0
801-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
802790
; GFX11-NEXT: s_endpgm
803791
%id = call i32 @llvm.amdgcn.workitem.id.x()
804792
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
@@ -2302,8 +2290,6 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
23022290
; GFX11-NEXT: s_clause 0x1
23032291
; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
23042292
; GFX11-NEXT: global_store_b32 v2, v1, s[0:1]
2305-
; GFX11-NEXT: s_nop 0
2306-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
23072293
; GFX11-NEXT: s_endpgm
23082294
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
23092295
%idx.0 = add nsw i32 %tid.x, 2
@@ -2390,8 +2376,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
23902376
; GFX11-NEXT: buffer_gl0_inv
23912377
; GFX11-NEXT: v_mov_b32_e32 v2, 0
23922378
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
2393-
; GFX11-NEXT: s_nop 0
2394-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
23952379
; GFX11-NEXT: s_endpgm
23962380
%result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
23972381
store i64 %result, ptr addrspace(1) %out, align 4
@@ -2474,8 +2458,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
24742458
; GFX11-NEXT: buffer_gl0_inv
24752459
; GFX11-NEXT: v_mov_b32_e32 v2, 0
24762460
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
2477-
; GFX11-NEXT: s_nop 0
2478-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
24792461
; GFX11-NEXT: s_endpgm
24802462
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
24812463
%result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -2679,8 +2661,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
26792661
; GFX11-NEXT: buffer_gl1_inv
26802662
; GFX11-NEXT: buffer_gl0_inv
26812663
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
2682-
; GFX11-NEXT: s_nop 0
2683-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
26842664
; GFX11-NEXT: s_endpgm
26852665
%result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
26862666
store i64 %result, ptr addrspace(1) %out, align 4
@@ -2762,8 +2742,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
27622742
; GFX11-NEXT: buffer_gl1_inv
27632743
; GFX11-NEXT: buffer_gl0_inv
27642744
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
2765-
; GFX11-NEXT: s_nop 0
2766-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
27672745
; GFX11-NEXT: s_endpgm
27682746
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
27692747
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -2846,8 +2824,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
28462824
; GFX11-NEXT: buffer_gl1_inv
28472825
; GFX11-NEXT: buffer_gl0_inv
28482826
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
2849-
; GFX11-NEXT: s_nop 0
2850-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
28512827
; GFX11-NEXT: s_endpgm
28522828
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
28532829
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
@@ -3153,8 +3129,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
31533129
; GFX11-NEXT: buffer_gl1_inv
31543130
; GFX11-NEXT: buffer_gl0_inv
31553131
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
3156-
; GFX11-NEXT: s_nop 0
3157-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
31583132
; GFX11-NEXT: s_endpgm
31593133
%id = call i32 @llvm.amdgcn.workitem.id.x()
31603134
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
@@ -3334,8 +3308,6 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
33343308
; GFX11-NEXT: s_clause 0x1
33353309
; GFX11-NEXT: global_store_b32 v3, v2, s[2:3]
33363310
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
3337-
; GFX11-NEXT: s_nop 0
3338-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
33393311
; GFX11-NEXT: s_endpgm
33403312
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
33413313
%idx.0 = add nsw i32 %tid.x, 2

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
8484
; GFX11-NEXT: buffer_gl0_inv
8585
; GFX11-NEXT: v_mov_b32_e32 v1, 0
8686
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
87-
; GFX11-NEXT: s_nop 0
88-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
8987
; GFX11-NEXT: s_endpgm
9088
%result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
9189
store i32 %result, ptr addrspace(1) %out, align 4
@@ -163,8 +161,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
163161
; GFX11-NEXT: buffer_gl0_inv
164162
; GFX11-NEXT: v_mov_b32_e32 v1, 0
165163
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
166-
; GFX11-NEXT: s_nop 0
167-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
168164
; GFX11-NEXT: s_endpgm
169165
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
170166
%result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -353,8 +349,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
353349
; GFX11-NEXT: buffer_gl1_inv
354350
; GFX11-NEXT: buffer_gl0_inv
355351
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
356-
; GFX11-NEXT: s_nop 0
357-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
358352
; GFX11-NEXT: s_endpgm
359353
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
360354
store i32 %result, ptr addrspace(1) %out, align 4
@@ -431,8 +425,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
431425
; GFX11-NEXT: buffer_gl1_inv
432426
; GFX11-NEXT: buffer_gl0_inv
433427
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
434-
; GFX11-NEXT: s_nop 0
435-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
436428
; GFX11-NEXT: s_endpgm
437429
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
438430
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -510,8 +502,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_system(ptr addrspace
510502
; GFX11-NEXT: buffer_gl1_inv
511503
; GFX11-NEXT: buffer_gl0_inv
512504
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
513-
; GFX11-NEXT: s_nop 0
514-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
515505
; GFX11-NEXT: s_endpgm
516506
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
517507
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
@@ -797,8 +787,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
797787
; GFX11-NEXT: buffer_gl1_inv
798788
; GFX11-NEXT: buffer_gl0_inv
799789
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
800-
; GFX11-NEXT: s_nop 0
801-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
802790
; GFX11-NEXT: s_endpgm
803791
%id = call i32 @llvm.amdgcn.workitem.id.x()
804792
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
@@ -967,8 +955,6 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
967955
; GFX11-NEXT: s_clause 0x1
968956
; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
969957
; GFX11-NEXT: global_store_b32 v2, v1, s[0:1]
970-
; GFX11-NEXT: s_nop 0
971-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
972958
; GFX11-NEXT: s_endpgm
973959
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
974960
%idx.0 = add nsw i32 %tid.x, 2
@@ -1055,8 +1041,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
10551041
; GFX11-NEXT: buffer_gl0_inv
10561042
; GFX11-NEXT: v_mov_b32_e32 v2, 0
10571043
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1058-
; GFX11-NEXT: s_nop 0
1059-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
10601044
; GFX11-NEXT: s_endpgm
10611045
%result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
10621046
store i64 %result, ptr addrspace(1) %out, align 4
@@ -1139,8 +1123,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
11391123
; GFX11-NEXT: buffer_gl0_inv
11401124
; GFX11-NEXT: v_mov_b32_e32 v2, 0
11411125
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1142-
; GFX11-NEXT: s_nop 0
1143-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11441126
; GFX11-NEXT: s_endpgm
11451127
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
11461128
%result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -1344,8 +1326,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
13441326
; GFX11-NEXT: buffer_gl1_inv
13451327
; GFX11-NEXT: buffer_gl0_inv
13461328
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1347-
; GFX11-NEXT: s_nop 0
1348-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
13491329
; GFX11-NEXT: s_endpgm
13501330
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
13511331
store i64 %result, ptr addrspace(1) %out, align 4
@@ -1427,8 +1407,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
14271407
; GFX11-NEXT: buffer_gl1_inv
14281408
; GFX11-NEXT: buffer_gl0_inv
14291409
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1430-
; GFX11-NEXT: s_nop 0
1431-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
14321410
; GFX11-NEXT: s_endpgm
14331411
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
14341412
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -1511,8 +1489,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
15111489
; GFX11-NEXT: buffer_gl1_inv
15121490
; GFX11-NEXT: buffer_gl0_inv
15131491
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1514-
; GFX11-NEXT: s_nop 0
1515-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
15161492
; GFX11-NEXT: s_endpgm
15171493
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
15181494
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
@@ -1818,8 +1794,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
18181794
; GFX11-NEXT: buffer_gl1_inv
18191795
; GFX11-NEXT: buffer_gl0_inv
18201796
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1821-
; GFX11-NEXT: s_nop 0
1822-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
18231797
; GFX11-NEXT: s_endpgm
18241798
%id = call i32 @llvm.amdgcn.workitem.id.x()
18251799
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
@@ -2680,8 +2654,6 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
26802654
; GFX11-NEXT: s_clause 0x1
26812655
; GFX11-NEXT: global_store_b32 v3, v2, s[2:3]
26822656
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
2683-
; GFX11-NEXT: s_nop 0
2684-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
26852657
; GFX11-NEXT: s_endpgm
26862658
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
26872659
%idx.0 = add nsw i32 %tid.x, 2
@@ -3541,8 +3513,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
35413513
; GFX11-NEXT: s_clause 0x1
35423514
; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
35433515
; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
3544-
; GFX11-NEXT: s_nop 0
3545-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
35463516
; GFX11-NEXT: s_endpgm
35473517
%result0 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
35483518
%result1 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4

0 commit comments

Comments
 (0)