Skip to content

Commit 3f8c7e9

Browse files
authored
[AMDGPU] Insert waitcnt for non-global fence release in GFX12 (#159282)
A release fence may be followed by a barrier, so it must wait for the relevant memory accesses to complete even when its MMRA metadata limits it to LDS. Previously, the GFX12 insertRelease returned early for release fences that did not cover the global address space, so no wait was inserted for them. Fixes SWDEV-554932.
1 parent b6c061e commit 3f8c7e9

File tree

3 files changed

+187
-38
lines changed

3 files changed

+187
-38
lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 38 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -2514,60 +2514,60 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
25142514
SIAtomicAddrSpace AddrSpace,
25152515
bool IsCrossAddrSpaceOrdering,
25162516
Position Pos) const {
2517+
bool Changed = false;
2518+
25172519
MachineBasicBlock &MBB = *MI->getParent();
25182520
DebugLoc DL = MI->getDebugLoc();
25192521

25202522
// The scratch address space does not need the global memory cache
25212523
// writeback as all memory operations by the same thread are
25222524
// sequentially consistent, and no other thread can access scratch
25232525
// memory.
2526+
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2527+
if (Pos == Position::AFTER)
2528+
++MI;
25242529

2525-
// Other address spaces do not have a cache.
2526-
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2527-
return false;
2528-
2529-
if (Pos == Position::AFTER)
2530-
++MI;
2531-
2532-
// global_wb is only necessary at system scope for GFX12.0,
2533-
// they're also necessary at device scope for GFX12.5.
2534-
//
2535-
// Emitting it for lower scopes is a slow no-op, so we omit it
2536-
// for performance.
2537-
switch (Scope) {
2538-
case SIAtomicScope::SYSTEM:
2539-
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2540-
.addImm(AMDGPU::CPol::SCOPE_SYS);
2541-
break;
2542-
case SIAtomicScope::AGENT:
2543-
// TODO DOCS
2544-
if (ST.hasGFX1250Insts()) {
2530+
// global_wb is only necessary at system scope for GFX12.0,
2531+
// they're also necessary at device scope for GFX12.5.
2532+
//
2533+
// Emitting it for lower scopes is a slow no-op, so we omit it
2534+
// for performance.
2535+
switch (Scope) {
2536+
case SIAtomicScope::SYSTEM:
25452537
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2546-
.addImm(AMDGPU::CPol::SCOPE_DEV);
2538+
.addImm(AMDGPU::CPol::SCOPE_SYS);
2539+
Changed = true;
2540+
break;
2541+
case SIAtomicScope::AGENT:
2542+
// TODO DOCS
2543+
if (ST.hasGFX1250Insts()) {
2544+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2545+
.addImm(AMDGPU::CPol::SCOPE_DEV);
2546+
Changed = true;
2547+
}
2548+
break;
2549+
case SIAtomicScope::CLUSTER:
2550+
case SIAtomicScope::WORKGROUP:
2551+
// No WB necessary, but we still have to wait.
2552+
case SIAtomicScope::WAVEFRONT:
2553+
case SIAtomicScope::SINGLETHREAD:
2554+
// No WB or wait necessary here, but insertWait takes care of that.
2555+
break;
2556+
default:
2557+
llvm_unreachable("Unsupported synchronization scope");
25472558
}
2548-
break;
2549-
case SIAtomicScope::CLUSTER:
2550-
case SIAtomicScope::WORKGROUP:
2551-
// No WB necessary, but we still have to wait.
2552-
break;
2553-
case SIAtomicScope::WAVEFRONT:
2554-
case SIAtomicScope::SINGLETHREAD:
2555-
// No WB or wait necessary here.
2556-
return false;
2557-
default:
2558-
llvm_unreachable("Unsupported synchronization scope");
2559-
}
25602559

2561-
if (Pos == Position::AFTER)
2562-
--MI;
2560+
if (Pos == Position::AFTER)
2561+
--MI;
2562+
}
25632563

25642564
// We always have to wait for previous memory operations (load/store) to
25652565
// complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
25662566
// we of course need to wait for that as well.
2567-
insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2568-
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
2567+
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2568+
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
25692569

2570-
return true;
2570+
return Changed;
25712571
}
25722572

25732573
bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
Lines changed: 122 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,122 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
4+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
5+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
6+
7+
8+
define float @test_barrier_workgroup_local_mmra(ptr addrspace(3) noundef %x, ptr addrspace(3) noundef %y, float %val) {
9+
; GFX10-WGP-LABEL: test_barrier_workgroup_local_mmra:
10+
; GFX10-WGP: ; %bb.0:
11+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12+
; GFX10-WGP-NEXT: ds_write_b32 v0, v2
13+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
14+
; GFX10-WGP-NEXT: s_barrier
15+
; GFX10-WGP-NEXT: ds_read_b32 v0, v1
16+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
17+
; GFX10-WGP-NEXT: s_setpc_b64 s[30:31]
18+
;
19+
; GFX11-WGP-LABEL: test_barrier_workgroup_local_mmra:
20+
; GFX11-WGP: ; %bb.0:
21+
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22+
; GFX11-WGP-NEXT: ds_store_b32 v0, v2
23+
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
24+
; GFX11-WGP-NEXT: s_barrier
25+
; GFX11-WGP-NEXT: ds_load_b32 v0, v1
26+
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
27+
; GFX11-WGP-NEXT: s_setpc_b64 s[30:31]
28+
;
29+
; GFX12-WGP-LABEL: test_barrier_workgroup_local_mmra:
30+
; GFX12-WGP: ; %bb.0:
31+
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
32+
; GFX12-WGP-NEXT: s_wait_expcnt 0x0
33+
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
34+
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
35+
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
36+
; GFX12-WGP-NEXT: ds_store_b32 v0, v2
37+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
38+
; GFX12-WGP-NEXT: s_barrier_signal -1
39+
; GFX12-WGP-NEXT: s_barrier_wait -1
40+
; GFX12-WGP-NEXT: ds_load_b32 v0, v1
41+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
42+
; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
43+
;
44+
; GFX1250-LABEL: test_barrier_workgroup_local_mmra:
45+
; GFX1250: ; %bb.0:
46+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
47+
; GFX1250-NEXT: s_wait_kmcnt 0x0
48+
; GFX1250-NEXT: ds_store_b32 v0, v2
49+
; GFX1250-NEXT: s_wait_dscnt 0x0
50+
; GFX1250-NEXT: s_barrier_signal -1
51+
; GFX1250-NEXT: s_barrier_wait -1
52+
; GFX1250-NEXT: ds_load_b32 v0, v1
53+
; GFX1250-NEXT: s_wait_dscnt 0x0
54+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
55+
store float %val, ptr addrspace(3) %x
56+
fence syncscope("workgroup") release, !mmra !0
57+
tail call void @llvm.amdgcn.s.barrier()
58+
fence syncscope("workgroup") acquire, !mmra !0
59+
%ret = load float, ptr addrspace(3) %y
60+
ret float %ret
61+
}
62+
63+
define float @test_barrier_workgroup_global_mmra(ptr addrspace(1) noundef %x, ptr addrspace(1) noundef %y, float %val) {
64+
; GFX10-WGP-LABEL: test_barrier_workgroup_global_mmra:
65+
; GFX10-WGP: ; %bb.0:
66+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67+
; GFX10-WGP-NEXT: global_store_dword v[0:1], v4, off
68+
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
69+
; GFX10-WGP-NEXT: s_barrier
70+
; GFX10-WGP-NEXT: buffer_gl0_inv
71+
; GFX10-WGP-NEXT: global_load_dword v0, v[2:3], off
72+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
73+
; GFX10-WGP-NEXT: s_setpc_b64 s[30:31]
74+
;
75+
; GFX11-WGP-LABEL: test_barrier_workgroup_global_mmra:
76+
; GFX11-WGP: ; %bb.0:
77+
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78+
; GFX11-WGP-NEXT: global_store_b32 v[0:1], v4, off
79+
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
80+
; GFX11-WGP-NEXT: s_barrier
81+
; GFX11-WGP-NEXT: buffer_gl0_inv
82+
; GFX11-WGP-NEXT: global_load_b32 v0, v[2:3], off
83+
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
84+
; GFX11-WGP-NEXT: s_setpc_b64 s[30:31]
85+
;
86+
; GFX12-WGP-LABEL: test_barrier_workgroup_global_mmra:
87+
; GFX12-WGP: ; %bb.0:
88+
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
89+
; GFX12-WGP-NEXT: s_wait_expcnt 0x0
90+
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
91+
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
92+
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
93+
; GFX12-WGP-NEXT: global_store_b32 v[0:1], v4, off
94+
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
95+
; GFX12-WGP-NEXT: s_barrier_signal -1
96+
; GFX12-WGP-NEXT: s_barrier_wait -1
97+
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
98+
; GFX12-WGP-NEXT: global_load_b32 v0, v[2:3], off
99+
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
100+
; GFX12-WGP-NEXT: s_setpc_b64 s[30:31]
101+
;
102+
; GFX1250-LABEL: test_barrier_workgroup_global_mmra:
103+
; GFX1250: ; %bb.0:
104+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
105+
; GFX1250-NEXT: s_wait_kmcnt 0x0
106+
; GFX1250-NEXT: global_store_b32 v[0:1], v4, off
107+
; GFX1250-NEXT: s_wait_storecnt 0x0
108+
; GFX1250-NEXT: s_barrier_signal -1
109+
; GFX1250-NEXT: s_barrier_wait -1
110+
; GFX1250-NEXT: global_load_b32 v0, v[2:3], off
111+
; GFX1250-NEXT: s_wait_loadcnt 0x0
112+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
113+
store float %val, ptr addrspace(1) %x
114+
fence syncscope("workgroup") release, !mmra !1
115+
tail call void @llvm.amdgcn.s.barrier()
116+
fence syncscope("workgroup") acquire, !mmra !1
117+
%ret = load float, ptr addrspace(1) %y
118+
ret float %ret
119+
}
120+
121+
!0 = !{!"amdgpu-synchronize-as", !"local"}
122+
!1 = !{!"amdgpu-synchronize-as", !"global"}

llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -143,14 +143,17 @@ define amdgpu_kernel void @workgroup_release_fence() {
143143
;
144144
; GFX12-WGP-LABEL: workgroup_release_fence:
145145
; GFX12-WGP: ; %bb.0: ; %entry
146+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
146147
; GFX12-WGP-NEXT: s_endpgm
147148
;
148149
; GFX12-CU-LABEL: workgroup_release_fence:
149150
; GFX12-CU: ; %bb.0: ; %entry
151+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
150152
; GFX12-CU-NEXT: s_endpgm
151153
;
152154
; GFX1250-LABEL: workgroup_release_fence:
153155
; GFX1250: ; %bb.0: ; %entry
156+
; GFX1250-NEXT: s_wait_dscnt 0x0
154157
; GFX1250-NEXT: s_endpgm
155158
entry:
156159
fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -213,14 +216,17 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
213216
;
214217
; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
215218
; GFX12-WGP: ; %bb.0: ; %entry
219+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
216220
; GFX12-WGP-NEXT: s_endpgm
217221
;
218222
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
219223
; GFX12-CU: ; %bb.0: ; %entry
224+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
220225
; GFX12-CU-NEXT: s_endpgm
221226
;
222227
; GFX1250-LABEL: workgroup_acq_rel_fence:
223228
; GFX1250: ; %bb.0: ; %entry
229+
; GFX1250-NEXT: s_wait_dscnt 0x0
224230
; GFX1250-NEXT: s_endpgm
225231
entry:
226232
fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -283,14 +289,17 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
283289
;
284290
; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
285291
; GFX12-WGP: ; %bb.0: ; %entry
292+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
286293
; GFX12-WGP-NEXT: s_endpgm
287294
;
288295
; GFX12-CU-LABEL: workgroup_seq_cst_fence:
289296
; GFX12-CU: ; %bb.0: ; %entry
297+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
290298
; GFX12-CU-NEXT: s_endpgm
291299
;
292300
; GFX1250-LABEL: workgroup_seq_cst_fence:
293301
; GFX1250: ; %bb.0: ; %entry
302+
; GFX1250-NEXT: s_wait_dscnt 0x0
294303
; GFX1250-NEXT: s_endpgm
295304
entry:
296305
fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -670,14 +679,17 @@ define amdgpu_kernel void @agent_release_fence() {
670679
;
671680
; GFX12-WGP-LABEL: agent_release_fence:
672681
; GFX12-WGP: ; %bb.0: ; %entry
682+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
673683
; GFX12-WGP-NEXT: s_endpgm
674684
;
675685
; GFX12-CU-LABEL: agent_release_fence:
676686
; GFX12-CU: ; %bb.0: ; %entry
687+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
677688
; GFX12-CU-NEXT: s_endpgm
678689
;
679690
; GFX1250-LABEL: agent_release_fence:
680691
; GFX1250: ; %bb.0: ; %entry
692+
; GFX1250-NEXT: s_wait_dscnt 0x0
681693
; GFX1250-NEXT: s_endpgm
682694
entry:
683695
fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -740,14 +752,17 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
740752
;
741753
; GFX12-WGP-LABEL: agent_acq_rel_fence:
742754
; GFX12-WGP: ; %bb.0: ; %entry
755+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
743756
; GFX12-WGP-NEXT: s_endpgm
744757
;
745758
; GFX12-CU-LABEL: agent_acq_rel_fence:
746759
; GFX12-CU: ; %bb.0: ; %entry
760+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
747761
; GFX12-CU-NEXT: s_endpgm
748762
;
749763
; GFX1250-LABEL: agent_acq_rel_fence:
750764
; GFX1250: ; %bb.0: ; %entry
765+
; GFX1250-NEXT: s_wait_dscnt 0x0
751766
; GFX1250-NEXT: s_endpgm
752767
entry:
753768
fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -810,14 +825,17 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
810825
;
811826
; GFX12-WGP-LABEL: agent_seq_cst_fence:
812827
; GFX12-WGP: ; %bb.0: ; %entry
828+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
813829
; GFX12-WGP-NEXT: s_endpgm
814830
;
815831
; GFX12-CU-LABEL: agent_seq_cst_fence:
816832
; GFX12-CU: ; %bb.0: ; %entry
833+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
817834
; GFX12-CU-NEXT: s_endpgm
818835
;
819836
; GFX1250-LABEL: agent_seq_cst_fence:
820837
; GFX1250: ; %bb.0: ; %entry
838+
; GFX1250-NEXT: s_wait_dscnt 0x0
821839
; GFX1250-NEXT: s_endpgm
822840
entry:
823841
fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1197,14 +1215,17 @@ define amdgpu_kernel void @system_release_fence() {
11971215
;
11981216
; GFX12-WGP-LABEL: system_release_fence:
11991217
; GFX12-WGP: ; %bb.0: ; %entry
1218+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
12001219
; GFX12-WGP-NEXT: s_endpgm
12011220
;
12021221
; GFX12-CU-LABEL: system_release_fence:
12031222
; GFX12-CU: ; %bb.0: ; %entry
1223+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
12041224
; GFX12-CU-NEXT: s_endpgm
12051225
;
12061226
; GFX1250-LABEL: system_release_fence:
12071227
; GFX1250: ; %bb.0: ; %entry
1228+
; GFX1250-NEXT: s_wait_dscnt 0x0
12081229
; GFX1250-NEXT: s_endpgm
12091230
entry:
12101231
fence release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1267,14 +1288,17 @@ define amdgpu_kernel void @system_acq_rel_fence() {
12671288
;
12681289
; GFX12-WGP-LABEL: system_acq_rel_fence:
12691290
; GFX12-WGP: ; %bb.0: ; %entry
1291+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
12701292
; GFX12-WGP-NEXT: s_endpgm
12711293
;
12721294
; GFX12-CU-LABEL: system_acq_rel_fence:
12731295
; GFX12-CU: ; %bb.0: ; %entry
1296+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
12741297
; GFX12-CU-NEXT: s_endpgm
12751298
;
12761299
; GFX1250-LABEL: system_acq_rel_fence:
12771300
; GFX1250: ; %bb.0: ; %entry
1301+
; GFX1250-NEXT: s_wait_dscnt 0x0
12781302
; GFX1250-NEXT: s_endpgm
12791303
entry:
12801304
fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1337,14 +1361,17 @@ define amdgpu_kernel void @system_seq_cst_fence() {
13371361
;
13381362
; GFX12-WGP-LABEL: system_seq_cst_fence:
13391363
; GFX12-WGP: ; %bb.0: ; %entry
1364+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
13401365
; GFX12-WGP-NEXT: s_endpgm
13411366
;
13421367
; GFX12-CU-LABEL: system_seq_cst_fence:
13431368
; GFX12-CU: ; %bb.0: ; %entry
1369+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
13441370
; GFX12-CU-NEXT: s_endpgm
13451371
;
13461372
; GFX1250-LABEL: system_seq_cst_fence:
13471373
; GFX1250: ; %bb.0: ; %entry
1374+
; GFX1250-NEXT: s_wait_dscnt 0x0
13481375
; GFX1250-NEXT: s_endpgm
13491376
entry:
13501377
fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}

0 commit comments

Comments
 (0)