Skip to content

Commit 386826d

Browse files
committed
[AMDGPU][gfx1250] Also add a wait on xcnt before volatile accesses
1 parent 95d788c commit 386826d

16 files changed

+56
-5
lines changed

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1872,8 +1872,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18721872
bool hasClusters() const { return HasClusters; }
18731873

18741874
/// \returns true if the subtarget requires a wait for xcnt before atomic
1875-
/// flat/global stores & rmw.
1876-
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
1875+
/// stores/rmw and all volatile accesses, for all isFLAT operations.
1876+
bool requiresWaitXCntBeforeAtomicStoreOrVolatileAccesses() const {
1877+
return GFX1250Insts;
1878+
}
18771879

18781880
/// \returns the number of significant bits in the immediate field of the
18791881
/// S_NOP instruction.

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2059,6 +2059,13 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
20592059
if (IsVolatile) {
20602060
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
20612061

2062+
if (ST.requiresWaitXCntBeforeAtomicStoreOrVolatileAccesses() &&
2063+
TII->isFLAT(*MI)) {
2064+
MachineBasicBlock &MBB = *MI->getParent();
2065+
BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2066+
Changed = true;
2067+
}
2068+
20622069
// Ensure operation has completed at system scope to cause all volatile
20632070
// operations to be visible outside the program in a global order. Do not
20642071
// request cross address space as only the global address space can be
@@ -2077,9 +2084,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
20772084
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
20782085
bool Changed = false;
20792086

2080-
// GFX12.5 only: xcnt wait is needed before flat and global atomics
2081-
// stores/rmw.
2082-
if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
2087+
if (Atomic && ST.requiresWaitXCntBeforeAtomicStoreOrVolatileAccesses() &&
2088+
TII->isFLAT(MI)) {
20832089
MachineBasicBlock &MBB = *MI.getParent();
20842090
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
20852091
Changed = true;

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5136,6 +5136,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
51365136
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
51375137
; GFX1250-NEXT: scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
51385138
; GFX1250-NEXT: s_wait_storecnt 0x0
5139+
; GFX1250-NEXT: s_wait_xcnt 0x0
51395140
; GFX1250-NEXT: scratch_store_b32 v4, v0, off scope:SCOPE_SYS
51405141
; GFX1250-NEXT: s_wait_storecnt 0x0
51415142
; GFX1250-NEXT: v_readlane_b32 s31, v5, 1
@@ -6215,6 +6216,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
62156216
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
62166217
; GFX1250-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
62176218
; GFX1250-NEXT: s_wait_storecnt 0x0
6219+
; GFX1250-NEXT: s_wait_xcnt 0x0
62186220
; GFX1250-NEXT: scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS
62196221
; GFX1250-NEXT: s_wait_storecnt 0x0
62206222
; GFX1250-NEXT: v_readlane_b32 s31, v9, 1

llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7409,8 +7409,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
74097409
; GFX1250-SDAG-NEXT: v_mul_lo_u32 v3, v1, v0
74107410
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v2, off scope:SCOPE_SYS
74117411
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
7412+
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
74127413
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v1, off scope:SCOPE_SYS
74137414
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
7415+
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
74147416
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v3, off scope:SCOPE_SYS
74157417
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
74167418
; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v0
@@ -7431,8 +7433,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
74317433
; GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v1, v0
74327434
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS
74337435
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
7436+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
74347437
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v1, off scope:SCOPE_SYS
74357438
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
7439+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
74367440
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS
74377441
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
74387442
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v0
@@ -7686,6 +7690,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
76867690
; GFX1250-SDAG-NEXT: v_mul_lo_u32 v3, v0, v1
76877691
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v2, off scope:SCOPE_SYS
76887692
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
7693+
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
76897694
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v3, off scope:SCOPE_SYS
76907695
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
76917696
; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v1
@@ -7706,6 +7711,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
77067711
; GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v0, v1
77077712
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS
77087713
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
7714+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
77097715
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS
77107716
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
77117717
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v1

llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -111,6 +111,7 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
111111
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
112112
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
113113
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
114+
; GFX1250-NEXT: s_wait_xcnt 0x0
114115
; GFX1250-NEXT: s_wait_kmcnt 0x0
115116
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
116117
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0

llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1303,6 +1303,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
13031303
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
13041304
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
13051305
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
1306+
; GFX1250-NEXT: s_wait_xcnt 0x0
13061307
; GFX1250-NEXT: s_wait_kmcnt 0x0
13071308
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
13081309
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0

llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -150,6 +150,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
150150
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
151151
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
152152
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
153+
; GFX1250-NEXT: s_wait_xcnt 0x0
153154
; GFX1250-NEXT: s_wait_kmcnt 0x0
154155
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
155156
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -421,6 +422,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
421422
; GFX1250-NEXT: s_wait_xcnt 0x0
422423
; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
423424
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
425+
; GFX1250-NEXT: s_wait_xcnt 0x0
424426
; GFX1250-NEXT: s_wait_kmcnt 0x0
425427
; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
426428
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -582,6 +584,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
582584
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
583585
; GFX1250-NEXT: s_wait_kmcnt 0x0
584586
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
587+
; GFX1250-NEXT: s_wait_xcnt 0x0
585588
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
586589
; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
587590
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -849,6 +852,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
849852
; GFX1250-NEXT: s_wait_xcnt 0x0
850853
; GFX1250-NEXT: s_mov_b32 s2, 0x3ff
851854
; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2
855+
; GFX1250-NEXT: s_wait_xcnt 0x0
852856
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
853857
; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
854858
; GFX1250-NEXT: s_wait_storecnt 0x0

llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,7 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i
9292
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
9393
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
9494
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
95+
; GFX1250-NEXT: s_wait_xcnt 0x0
9596
; GFX1250-NEXT: s_wait_kmcnt 0x0
9697
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
9798
; GFX1250-NEXT: s_wait_loadcnt 0x0

llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1104,6 +1104,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
11041104
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
11051105
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11061106
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
1107+
; GFX1250-NEXT: s_wait_xcnt 0x0
11071108
; GFX1250-NEXT: s_wait_kmcnt 0x0
11081109
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
11091110
; GFX1250-NEXT: s_wait_loadcnt 0x0

llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -153,6 +153,7 @@ define amdgpu_kernel void @global_volatile_load_0(
153153
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
154154
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
155155
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
156+
; GFX1250-NEXT: s_wait_xcnt 0x0
156157
; GFX1250-NEXT: s_wait_kmcnt 0x0
157158
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
158159
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -361,6 +362,7 @@ define amdgpu_kernel void @global_volatile_load_1(
361362
; GFX1250-NEXT: s_wait_xcnt 0x0
362363
; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
363364
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
365+
; GFX1250-NEXT: s_wait_xcnt 0x0
364366
; GFX1250-NEXT: s_wait_kmcnt 0x0
365367
; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
366368
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -532,6 +534,7 @@ define amdgpu_kernel void @global_volatile_store_0(
532534
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
533535
; GFX1250-NEXT: s_wait_kmcnt 0x0
534536
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
537+
; GFX1250-NEXT: s_wait_xcnt 0x0
535538
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
536539
; GFX1250-NEXT: s_wait_storecnt 0x0
537540
; GFX1250-NEXT: s_endpgm
@@ -733,6 +736,7 @@ define amdgpu_kernel void @global_volatile_store_1(
733736
; GFX1250-NEXT: v_and_b32_e64 v0, v0, s3
734737
; GFX1250-NEXT: s_wait_kmcnt 0x0
735738
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
739+
; GFX1250-NEXT: s_wait_xcnt 0x0
736740
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
737741
; GFX1250-NEXT: s_wait_storecnt 0x0
738742
; GFX1250-NEXT: s_endpgm

0 commit comments

Comments
 (0)