Skip to content

Commit 4676242

Browse files
authored
AMDGPU/GFX12: Do not wait unnecessarily before barriers (#154970)
The barrier intrinsic itself should not have memory semantics. Frontends should use appropriate fence instructions for memory effects, and some frontends want to rely on that for performance (e.g. wait only for LDS before a barrier). See the code comment for more detail.
1 parent 304ef65 commit 4676242

File tree

5 files changed

+30
-36
lines changed

5 files changed

+30
-36
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2014,11 +2014,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
20142014
}
20152015
}
20162016

2017-
// The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
2018-
// not, we need to ensure the subtarget is capable of backing off barrier
2019-
// instructions in case there are any outstanding memory operations that may
2020-
// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
2021-
if (TII->isBarrierStart(MI.getOpcode()) &&
2017+
// Ensure safety against exceptions from outstanding memory operations while
2018+
// waiting for a barrier:
2019+
//
2020+
// * Some subtargets safely handle backing off the barrier in hardware
2021+
// when an exception occurs.
2022+
// * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2023+
// there can be no outstanding memory operations during the wait.
2024+
// * Subtargets with split barriers don't need to back off the barrier; it
2025+
// is up to the trap handler to preserve the user barrier state correctly.
2026+
//
2027+
// In all other cases, ensure safety by ensuring that there are no outstanding
2028+
// memory operations.
2029+
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
20222030
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
20232031
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
20242032
}

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -983,19 +983,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
983983
return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
984984
}
985985

986-
// Check to see if opcode is for a barrier start. Pre gfx12 this is just the
987-
// S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want
988-
// to check for the barrier start (S_BARRIER_SIGNAL*)
989-
bool isBarrierStart(unsigned Opcode) const {
986+
bool isBarrier(unsigned Opcode) const {
990987
return Opcode == AMDGPU::S_BARRIER ||
991988
Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
992989
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
993990
Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
994-
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
995-
}
996-
997-
bool isBarrier(unsigned Opcode) const {
998-
return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
991+
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
992+
Opcode == AMDGPU::S_BARRIER_WAIT ||
999993
Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
1000994
Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
1001995
Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
9898
; VARIANT4-NEXT: s_wait_kmcnt 0x0
9999
; VARIANT4-NEXT: v_xad_u32 v0, v2, -1, s2
100100
; VARIANT4-NEXT: global_store_b32 v3, v2, s[0:1]
101-
; VARIANT4-NEXT: s_wait_storecnt 0x0
102101
; VARIANT4-NEXT: s_barrier_signal -1
103102
; VARIANT4-NEXT: s_barrier_wait -1
104103
; VARIANT4-NEXT: v_ashrrev_i32_e32 v1, 31, v0
@@ -145,7 +144,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
145144
; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4
146145
; VARIANT6-NEXT: global_store_b32 v5, v4, s[0:1]
147146
; VARIANT6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
148-
; VARIANT6-NEXT: s_wait_storecnt 0x0
149147
; VARIANT6-NEXT: s_barrier_signal -1
150148
; VARIANT6-NEXT: s_barrier_wait -1
151149
; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ define i1 @func1() {
1111
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
1212
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
1313
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
14-
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
1514
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
1615
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
1716
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
@@ -27,7 +26,6 @@ define i1 @func1() {
2726
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
2827
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
2928
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
30-
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
3129
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
3230
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
3331
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe

llvm/test/CodeGen/AMDGPU/s-barrier.ll

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,10 @@ define void @func1() {
1414
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
1515
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
1616
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
17-
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003
18-
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
19-
; GFX12-SDAG-NEXT: s_barrier_signal m0
2017
; GFX12-SDAG-NEXT: s_mov_b32 m0, 3
2118
; GFX12-SDAG-NEXT: s_barrier_join m0
19+
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003
20+
; GFX12-SDAG-NEXT: s_barrier_signal m0
2221
; GFX12-SDAG-NEXT: s_barrier_wait 1
2322
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
2423
;
@@ -30,13 +29,12 @@ define void @func1() {
3029
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
3130
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
3231
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70003
33-
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
34-
; GFX12-GISEL-NEXT: s_barrier_signal m0
3532
; GFX12-GISEL-NEXT: s_barrier_join 3
33+
; GFX12-GISEL-NEXT: s_barrier_signal m0
3634
; GFX12-GISEL-NEXT: s_barrier_wait 1
3735
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
38-
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
3936
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
37+
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
4038
call void @llvm.amdgcn.s.barrier.wait(i16 1)
4139
ret void
4240
}
@@ -49,11 +47,10 @@ define void @func2() {
4947
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
5048
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
5149
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
52-
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001
53-
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
54-
; GFX12-SDAG-NEXT: s_barrier_signal m0
5550
; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
5651
; GFX12-SDAG-NEXT: s_barrier_join m0
52+
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001
53+
; GFX12-SDAG-NEXT: s_barrier_signal m0
5754
; GFX12-SDAG-NEXT: s_barrier_wait 1
5855
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
5956
;
@@ -65,13 +62,12 @@ define void @func2() {
6562
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
6663
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
6764
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70001
68-
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
69-
; GFX12-GISEL-NEXT: s_barrier_signal m0
7065
; GFX12-GISEL-NEXT: s_barrier_join 1
66+
; GFX12-GISEL-NEXT: s_barrier_signal m0
7167
; GFX12-GISEL-NEXT: s_barrier_wait 1
7268
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
73-
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
7469
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
70+
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
7571
call void @llvm.amdgcn.s.barrier.wait(i16 1)
7672
ret void
7773
}
@@ -102,9 +98,9 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
10298
; GFX12-SDAG-NEXT: s_barrier_signal m0
10399
; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
104100
; GFX12-SDAG-NEXT: s_barrier_signal -1
105-
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
106101
; GFX12-SDAG-NEXT: s_barrier_join m0
107102
; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
103+
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
108104
; GFX12-SDAG-NEXT: s_barrier_wait 1
109105
; GFX12-SDAG-NEXT: s_barrier_leave
110106
; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0
@@ -155,11 +151,11 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
155151
; GFX12-GISEL-NEXT: s_barrier_signal m0
156152
; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
157153
; GFX12-GISEL-NEXT: s_barrier_signal m0
154+
; GFX12-GISEL-NEXT: s_mov_b32 m0, s0
158155
; GFX12-GISEL-NEXT: s_barrier_signal -1
156+
; GFX12-GISEL-NEXT: s_barrier_join m0
159157
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
160-
; GFX12-GISEL-NEXT: s_mov_b32 m0, s0
161158
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
162-
; GFX12-GISEL-NEXT: s_barrier_join m0
163159
; GFX12-GISEL-NEXT: s_barrier_wait 1
164160
; GFX12-GISEL-NEXT: s_barrier_leave
165161
; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2
@@ -194,8 +190,8 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
194190
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
195191
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
196192
call void @llvm.amdgcn.s.barrier.signal(i32 -1)
197-
%isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
198193
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
194+
%isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
199195
call void @llvm.amdgcn.s.barrier.wait(i16 1)
200196
call void @llvm.amdgcn.s.barrier.leave(i16 1)
201197
%state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
@@ -219,14 +215,14 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
219215
; GFX12-SDAG-NEXT: s_load_b64 s[12:13], s[6:7], 0x0
220216
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002
221217
; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48
222-
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
223218
; GFX12-SDAG-NEXT: s_barrier_signal m0
224219
; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
225220
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
226221
; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
227222
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
228223
; GFX12-SDAG-NEXT: s_barrier_join m0
229224
; GFX12-SDAG-NEXT: s_barrier_wait 1
225+
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
230226
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13]
231227
; GFX12-SDAG-NEXT: s_endpgm
232228
;
@@ -245,10 +241,10 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
245241
; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
246242
; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
247243
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
248-
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
249244
; GFX12-GISEL-NEXT: s_barrier_signal m0
250245
; GFX12-GISEL-NEXT: s_barrier_join 2
251246
; GFX12-GISEL-NEXT: s_barrier_wait 1
247+
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
252248
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13]
253249
; GFX12-GISEL-NEXT: s_endpgm
254250
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)

0 commit comments

Comments
 (0)