Skip to content

Commit d6edc1a

Browse files
authored
[AMDGPU] Reenable BackOffBarrier on GFX11/12 (#155370)
Re-enable BackOffBarrier by adding a wait on vm_vsrc before every barrier "start" instruction in GFX10/11/12 CU mode. This is a weaker wait than the one emitted without BackOffBarrier, so it should not introduce any new guarantees that can be abused; instead, it relaxes the guarantees we have now to the bare minimum needed to support the behavior users want (a release fence followed by a barrier works). An exact memory model is in the works and will be documented separately.
1 parent 8f59a94 commit d6edc1a

File tree

8 files changed

+229
-9
lines changed

8 files changed

+229
-9
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1902,6 +1902,7 @@ def FeatureISAVersion10_3_Generic: FeatureSet<
19021902

19031903
def FeatureISAVersion11_Common : FeatureSet<
19041904
[FeatureGFX11,
1905+
FeatureBackOffBarrier,
19051906
FeatureLDSBankCount32,
19061907
FeatureDLInsts,
19071908
FeatureDot5Insts,
@@ -1985,6 +1986,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
19851986

19861987
def FeatureISAVersion12 : FeatureSet<
19871988
[FeatureGFX12,
1989+
FeatureBackOffBarrier,
19881990
FeatureAddressableLocalMemorySize65536,
19891991
FeatureLDSBankCount32,
19901992
FeatureDLInsts,

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -983,13 +983,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
983983
return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
984984
}
985985

986-
bool isBarrier(unsigned Opcode) const {
986+
// Check to see if opcode is for a barrier start. Pre gfx12 this is just the
987+
// S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want
988+
// to check for the barrier start (S_BARRIER_SIGNAL*)
989+
bool isBarrierStart(unsigned Opcode) const {
987990
return Opcode == AMDGPU::S_BARRIER ||
988991
Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
989992
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
990993
Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
991-
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
992-
Opcode == AMDGPU::S_BARRIER_WAIT ||
994+
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
995+
}
996+
997+
bool isBarrier(unsigned Opcode) const {
998+
return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
993999
Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
9941000
Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
9951001
Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,12 @@ class SICacheControl {
359359
bool IsCrossAddrSpaceOrdering,
360360
Position Pos) const = 0;
361361

362+
/// Inserts any necessary instructions before the barrier start instruction
363+
/// \p MI in order to support pairing of barriers and fences.
364+
virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
365+
return false;
366+
};
367+
362368
/// Virtual destructor to allow derivations to be deleted.
363369
virtual ~SICacheControl() = default;
364370
};
@@ -547,6 +553,8 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
547553
SIAtomicScope Scope,
548554
SIAtomicAddrSpace AddrSpace,
549555
Position Pos) const override;
556+
557+
bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
550558
};
551559

552560
class SIGfx11CacheControl : public SIGfx10CacheControl {
@@ -2169,6 +2177,21 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
21692177
return Changed;
21702178
}
21712179

2180+
bool SIGfx10CacheControl::insertBarrierStart(
2181+
MachineBasicBlock::iterator &MI) const {
2182+
// We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
2183+
// mode. This is because a CU mode release fence does not emit any wait, which
2184+
// is fine when only dealing with vmem, but isn't sufficient in the presence
2185+
// of barriers which do not go through vmem.
2186+
if (!ST.isCuModeEnabled())
2187+
return false;
2188+
2189+
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2190+
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
2191+
.addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
2192+
return true;
2193+
}
2194+
21722195
bool SIGfx11CacheControl::enableLoadCacheBypass(
21732196
const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
21742197
SIAtomicAddrSpace AddrSpace) const {
@@ -2840,7 +2863,8 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
28402863
bool Changed = false;
28412864

28422865
SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
2843-
CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2866+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2867+
CC = SICacheControl::create(ST);
28442868

28452869
for (auto &MBB : MF) {
28462870
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
@@ -2860,6 +2884,11 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
28602884
MI = II->getIterator();
28612885
}
28622886

2887+
if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
2888+
Changed |= CC->insertBarrierStart(MI);
2889+
continue;
2890+
}
2891+
28632892
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
28642893
continue;
28652894

llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,8 @@ define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 {
5454
; GFX11-BACKOFF: ; %bb.0:
5555
; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5656
; GFX11-BACKOFF-NEXT: flat_load_b32 v0, v[0:1]
57-
; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
58-
; GFX11-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
5957
; GFX11-BACKOFF-NEXT: s_barrier
58+
; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6059
; GFX11-BACKOFF-NEXT: flat_store_b32 v[2:3], v0
6160
; GFX11-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
6261
; GFX11-BACKOFF-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
150150
; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
151151
; GFX10CU-NEXT: v_mov_b32_e32 v0, s13
152152
; GFX10CU-NEXT: s_waitcnt vmcnt(0)
153+
; GFX10CU-NEXT: s_waitcnt_depctr 0xffe3
153154
; GFX10CU-NEXT: s_barrier
154155
; GFX10CU-NEXT: ds_read_b32 v0, v0
155156
; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,9 +213,9 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
213213
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
214214
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
215215
; GFX11-NEXT: ds_load_b32 v1, v0
216-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
217216
; GFX11-NEXT: s_barrier
218217
; GFX11-NEXT: buffer_gl0_inv
218+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
219219
; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
220220
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1
221221
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
4+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
5+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
8+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
9+
10+
define amdgpu_kernel void @test_s_barrier() {
11+
; GFX10-WGP-LABEL: test_s_barrier:
12+
; GFX10-WGP: ; %bb.0: ; %entry
13+
; GFX10-WGP-NEXT: s_barrier
14+
; GFX10-WGP-NEXT: s_endpgm
15+
;
16+
; GFX10-CU-LABEL: test_s_barrier:
17+
; GFX10-CU: ; %bb.0: ; %entry
18+
; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
19+
; GFX10-CU-NEXT: s_barrier
20+
; GFX10-CU-NEXT: s_endpgm
21+
;
22+
; GFX11-WGP-LABEL: test_s_barrier:
23+
; GFX11-WGP: ; %bb.0: ; %entry
24+
; GFX11-WGP-NEXT: s_barrier
25+
; GFX11-WGP-NEXT: s_endpgm
26+
;
27+
; GFX11-CU-LABEL: test_s_barrier:
28+
; GFX11-CU: ; %bb.0: ; %entry
29+
; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
30+
; GFX11-CU-NEXT: s_barrier
31+
; GFX11-CU-NEXT: s_endpgm
32+
;
33+
; GFX12-WGP-LABEL: test_s_barrier:
34+
; GFX12-WGP: ; %bb.0: ; %entry
35+
; GFX12-WGP-NEXT: s_barrier_signal -1
36+
; GFX12-WGP-NEXT: s_barrier_wait -1
37+
; GFX12-WGP-NEXT: s_endpgm
38+
;
39+
; GFX12-CU-LABEL: test_s_barrier:
40+
; GFX12-CU: ; %bb.0: ; %entry
41+
; GFX12-CU-NEXT: s_wait_alu 0xffe3
42+
; GFX12-CU-NEXT: s_barrier_signal -1
43+
; GFX12-CU-NEXT: s_barrier_wait -1
44+
; GFX12-CU-NEXT: s_endpgm
45+
;
46+
; GFX1250-LABEL: test_s_barrier:
47+
; GFX1250: ; %bb.0: ; %entry
48+
; GFX1250-NEXT: s_wait_alu 0xffe3
49+
; GFX1250-NEXT: s_barrier_signal -1
50+
; GFX1250-NEXT: s_barrier_wait -1
51+
; GFX1250-NEXT: s_endpgm
52+
entry:
53+
call void @llvm.amdgcn.s.barrier()
54+
ret void
55+
}
56+
57+
define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
58+
; GFX10-WGP-LABEL: test_s_barrier_workgroup_fence:
59+
; GFX10-WGP: ; %bb.0: ; %entry
60+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
61+
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
62+
; GFX10-WGP-NEXT: s_barrier
63+
; GFX10-WGP-NEXT: s_endpgm
64+
;
65+
; GFX10-CU-LABEL: test_s_barrier_workgroup_fence:
66+
; GFX10-CU: ; %bb.0: ; %entry
67+
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
68+
; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
69+
; GFX10-CU-NEXT: s_barrier
70+
; GFX10-CU-NEXT: s_endpgm
71+
;
72+
; GFX11-WGP-LABEL: test_s_barrier_workgroup_fence:
73+
; GFX11-WGP: ; %bb.0: ; %entry
74+
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
75+
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
76+
; GFX11-WGP-NEXT: s_barrier
77+
; GFX11-WGP-NEXT: s_endpgm
78+
;
79+
; GFX11-CU-LABEL: test_s_barrier_workgroup_fence:
80+
; GFX11-CU: ; %bb.0: ; %entry
81+
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
82+
; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
83+
; GFX11-CU-NEXT: s_barrier
84+
; GFX11-CU-NEXT: s_endpgm
85+
;
86+
; GFX12-WGP-LABEL: test_s_barrier_workgroup_fence:
87+
; GFX12-WGP: ; %bb.0: ; %entry
88+
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
89+
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
90+
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
91+
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
92+
; GFX12-WGP-NEXT: s_barrier_signal -1
93+
; GFX12-WGP-NEXT: s_barrier_wait -1
94+
; GFX12-WGP-NEXT: s_endpgm
95+
;
96+
; GFX12-CU-LABEL: test_s_barrier_workgroup_fence:
97+
; GFX12-CU: ; %bb.0: ; %entry
98+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
99+
; GFX12-CU-NEXT: s_wait_alu 0xffe3
100+
; GFX12-CU-NEXT: s_barrier_signal -1
101+
; GFX12-CU-NEXT: s_barrier_wait -1
102+
; GFX12-CU-NEXT: s_endpgm
103+
;
104+
; GFX1250-LABEL: test_s_barrier_workgroup_fence:
105+
; GFX1250: ; %bb.0: ; %entry
106+
; GFX1250-NEXT: s_wait_dscnt 0x0
107+
; GFX1250-NEXT: s_wait_alu 0xffe3
108+
; GFX1250-NEXT: s_barrier_signal -1
109+
; GFX1250-NEXT: s_barrier_wait -1
110+
; GFX1250-NEXT: s_endpgm
111+
entry:
112+
fence syncscope("workgroup") release
113+
call void @llvm.amdgcn.s.barrier()
114+
ret void
115+
}
116+
117+
define amdgpu_kernel void @test_s_barrier_agent_fence() {
118+
; GFX10-WGP-LABEL: test_s_barrier_agent_fence:
119+
; GFX10-WGP: ; %bb.0: ; %entry
120+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
121+
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
122+
; GFX10-WGP-NEXT: s_barrier
123+
; GFX10-WGP-NEXT: s_endpgm
124+
;
125+
; GFX10-CU-LABEL: test_s_barrier_agent_fence:
126+
; GFX10-CU: ; %bb.0: ; %entry
127+
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
128+
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
129+
; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
130+
; GFX10-CU-NEXT: s_barrier
131+
; GFX10-CU-NEXT: s_endpgm
132+
;
133+
; GFX11-WGP-LABEL: test_s_barrier_agent_fence:
134+
; GFX11-WGP: ; %bb.0: ; %entry
135+
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
136+
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
137+
; GFX11-WGP-NEXT: s_barrier
138+
; GFX11-WGP-NEXT: s_endpgm
139+
;
140+
; GFX11-CU-LABEL: test_s_barrier_agent_fence:
141+
; GFX11-CU: ; %bb.0: ; %entry
142+
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
143+
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
144+
; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
145+
; GFX11-CU-NEXT: s_barrier
146+
; GFX11-CU-NEXT: s_endpgm
147+
;
148+
; GFX12-WGP-LABEL: test_s_barrier_agent_fence:
149+
; GFX12-WGP: ; %bb.0: ; %entry
150+
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
151+
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
152+
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
153+
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
154+
; GFX12-WGP-NEXT: s_barrier_signal -1
155+
; GFX12-WGP-NEXT: s_barrier_wait -1
156+
; GFX12-WGP-NEXT: s_endpgm
157+
;
158+
; GFX12-CU-LABEL: test_s_barrier_agent_fence:
159+
; GFX12-CU: ; %bb.0: ; %entry
160+
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
161+
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
162+
; GFX12-CU-NEXT: s_wait_storecnt 0x0
163+
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
164+
; GFX12-CU-NEXT: s_wait_alu 0xffe3
165+
; GFX12-CU-NEXT: s_barrier_signal -1
166+
; GFX12-CU-NEXT: s_barrier_wait -1
167+
; GFX12-CU-NEXT: s_endpgm
168+
;
169+
; GFX1250-LABEL: test_s_barrier_agent_fence:
170+
; GFX1250: ; %bb.0: ; %entry
171+
; GFX1250-NEXT: s_wait_bvhcnt 0x0
172+
; GFX1250-NEXT: s_wait_samplecnt 0x0
173+
; GFX1250-NEXT: s_wait_storecnt 0x0
174+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
175+
; GFX1250-NEXT: s_wait_alu 0xffe3
176+
; GFX1250-NEXT: s_barrier_signal -1
177+
; GFX1250-NEXT: s_barrier_wait -1
178+
; GFX1250-NEXT: s_endpgm
179+
entry:
180+
fence syncscope("agent") release
181+
call void @llvm.amdgcn.s.barrier()
182+
ret void
183+
}

llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ body: |
6262
; GFX11-NEXT: {{ $}}
6363
; GFX11-NEXT: S_WAITCNT 0
6464
; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
65-
; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
65+
; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 1
6666
; GFX11-NEXT: S_BARRIER
6767
; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
6868
; GFX11-NEXT: S_WAITCNT 7
@@ -176,7 +176,7 @@ body: |
176176
; GFX11-NEXT: {{ $}}
177177
; GFX11-NEXT: S_WAITCNT 0
178178
; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
179-
; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
179+
; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 1
180180
; GFX11-NEXT: S_BARRIER
181181
; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
182182
; GFX11-NEXT: S_WAITCNT 7

0 commit comments

Comments
 (0)