Skip to content

Commit 18f1c98

Browse files
authored
[AMDGPU] Avoid unneeded waitcounts before spill stores (#108303)
Implicit defs and uses on spill stores were accounted as real defs and uses, while they only exist for liveness accounting. As a result, unneeded waits were generated. Fixes: SWDEV-484177
1 parent 918972b commit 18f1c98

19 files changed

+566
-729
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -901,7 +901,16 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
901901
}
902902
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
903903
// Match the score to the destination registers.
904-
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
904+
//
905+
// Check only explicit operands. Stores, especially spill stores, include
906+
// implicit uses and defs of their super registers which would create an
907+
// artificial dependency, while these are there only for register liveness
908+
// accounting purposes.
909+
//
910+
// There are special cases where implicit register defs and uses exist,
911+
// such as M0, FLAT_SCR or VCC, but for those the wait will be generated
912+
// earlier, in generateWaitcntInstBefore(), if the register was loaded from memory.
913+
for (unsigned I = 0, E = Inst.getNumExplicitOperands(); I != E; ++I) {
905914
auto &Op = Inst.getOperand(I);
906915
if (!Op.isReg() || !Op.isDef())
907916
continue;

llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -268,7 +268,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
268268
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
269269
; GFX906-NEXT: s_waitcnt vmcnt(0)
270270
; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill
271-
; GFX906-NEXT: s_waitcnt vmcnt(0)
271+
; GFX906-NEXT: s_nop 0
272272
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
273273
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
274274
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
@@ -294,7 +294,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
294294
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
295295
; GFX906-NEXT: s_waitcnt vmcnt(0)
296296
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
297-
; GFX906-NEXT: s_waitcnt vmcnt(0)
297+
; GFX906-NEXT: s_nop 0
298298
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
299299
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
300300
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
@@ -317,7 +317,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
317317
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
318318
; GFX906-NEXT: s_waitcnt vmcnt(0)
319319
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
320-
; GFX906-NEXT: s_waitcnt vmcnt(0)
320+
; GFX906-NEXT: s_nop 0
321321
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
322322
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
323323
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -185,7 +185,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
185185
; GFX90A-NEXT: s_nop 7
186186
; GFX90A-NEXT: s_nop 2
187187
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
188-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
188+
; GFX90A-NEXT: s_nop 0
189189
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
190190
; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
191191
; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
@@ -215,7 +215,6 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
215215
; GFX90A-NEXT: buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
216216
; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
217217
; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
218-
; GFX90A-NEXT: s_waitcnt vmcnt(9)
219218
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v39 ; Reload Reuse
220219
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse
221220
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse
@@ -1093,7 +1092,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
10931092
; GFX90A-NEXT: s_nop 7
10941093
; GFX90A-NEXT: s_nop 2
10951094
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
1096-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
1095+
; GFX90A-NEXT: s_nop 0
10971096
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
10981097
; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
10991098
; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
@@ -1124,7 +1123,6 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
11241123
; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
11251124
; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
11261125
; GFX90A-NEXT: buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
1127-
; GFX90A-NEXT: s_waitcnt vmcnt(10)
11281126
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse
11291127
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse
11301128
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse

llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -708,7 +708,6 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
708708
; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
709709
; GCN-O0-NEXT: v_mov_b32_e32 v6, v2
710710
; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
711-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
712711
; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
713712
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
714713
; GCN-O0-NEXT: s_mov_b32 s2, 0
@@ -722,6 +721,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
722721
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
723722
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
724723
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
724+
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
725725
; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0
726726
; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1
727727
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
@@ -1159,7 +1159,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
11591159
; GCN-O0-NEXT: v_mov_b32_e32 v3, s10
11601160
; GCN-O0-NEXT: v_mov_b32_e32 v4, s11
11611161
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
1162-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11631162
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
11641163
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
11651164
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -1193,7 +1192,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
11931192
; GCN-O0-NEXT: v_mov_b32_e32 v3, s10
11941193
; GCN-O0-NEXT: v_mov_b32_e32 v4, s11
11951194
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1196-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11971195
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
11981196
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
11991197
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -1225,7 +1223,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12251223
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
12261224
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
12271225
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1228-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12291226
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
12301227
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
12311228
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -1247,7 +1244,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12471244
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
12481245
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12491246
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
1250-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12511247
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
12521248
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
12531249
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -1269,7 +1265,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12691265
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
12701266
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12711267
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
1272-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12731268
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
12741269
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
12751270
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -1343,7 +1338,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
13431338
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
13441339
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
13451340
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
1346-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
13471341
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
13481342
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
13491343
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill

0 commit comments

Comments
 (0)