Skip to content

Commit a086fb2

Browse files
authored
[AMDGPU][gfx1250] Add wait_xcnt before any access that cannot be repeated (#168852)
The xcnt wait is actually required before any memory access that can only be done once, so atomic stores and volatile accesses are affected. This patch also ensures buffer instructions are handled.
1 parent eb568d6 commit a086fb2

18 files changed

+504
-6
lines changed

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1871,9 +1871,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18711871
/// \returns true if the subtarget supports clusters of workgroups.
18721872
bool hasClusters() const { return HasClusters; }
18731873

1874-
/// \returns true if the subtarget requires a wait for xcnt before atomic
1875-
/// flat/global stores & rmw.
1876-
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
1874+
/// \returns true if the subtarget requires a wait for xcnt before VMEM
1875+
/// accesses that must never be repeated in the event of a page fault/re-try.
1876+
/// Atomic stores/rmw and all volatile accesses fall under this criteria.
1877+
bool requiresWaitXCntForSingleAccessInstructions() const {
1878+
return GFX1250Insts;
1879+
}
18771880

18781881
/// \returns the number of significant bits in the immediate field of the
18791882
/// S_NOP instruction.

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
585585
return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
586586
}
587587

588+
static bool isBUF(const MachineInstr &MI) {
589+
return isMUBUF(MI) || isMTBUF(MI);
590+
}
591+
588592
static bool isSMRD(const MachineInstr &MI) {
589593
return MI.getDesc().TSFlags & SIInstrFlags::SMRD;
590594
}

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,13 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
776776
}
777777
}
778778

779+
// FIXME: The MMO of buffer atomic instructions does not always have an atomic
780+
// ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
781+
// here, but the lowering should really be cleaned up at some point.
782+
if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(*MI) &&
783+
SIInstrInfo::isAtomic(*MI) && Ordering == AtomicOrdering::NotAtomic)
784+
Ordering = AtomicOrdering::Monotonic;
785+
779786
SIAtomicScope Scope = SIAtomicScope::NONE;
780787
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
781788
bool IsCrossAddressSpaceOrdering = false;
@@ -2059,6 +2066,13 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
20592066
if (IsVolatile) {
20602067
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
20612068

2069+
if (ST.requiresWaitXCntForSingleAccessInstructions() &&
2070+
SIInstrInfo::isVMEM(*MI)) {
2071+
MachineBasicBlock &MBB = *MI->getParent();
2072+
BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2073+
Changed = true;
2074+
}
2075+
20622076
// Ensure operation has completed at system scope to cause all volatile
20632077
// operations to be visible outside the program in a global order. Do not
20642078
// request cross address space as only the global address space can be
@@ -2077,9 +2091,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
20772091
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
20782092
bool Changed = false;
20792093

2080-
// GFX12.5 only: xcnt wait is needed before flat and global atomics
2081-
// stores/rmw.
2082-
if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
2094+
if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
2095+
SIInstrInfo::isVMEM(MI)) {
20832096
MachineBasicBlock &MBB = *MI.getParent();
20842097
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
20852098
Changed = true;

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5136,6 +5136,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
51365136
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
51375137
; GFX1250-NEXT: scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
51385138
; GFX1250-NEXT: s_wait_storecnt 0x0
5139+
; GFX1250-NEXT: s_wait_xcnt 0x0
51395140
; GFX1250-NEXT: scratch_store_b32 v4, v0, off scope:SCOPE_SYS
51405141
; GFX1250-NEXT: s_wait_storecnt 0x0
51415142
; GFX1250-NEXT: v_readlane_b32 s31, v5, 1
@@ -6215,6 +6216,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
62156216
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
62166217
; GFX1250-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
62176218
; GFX1250-NEXT: s_wait_storecnt 0x0
6219+
; GFX1250-NEXT: s_wait_xcnt 0x0
62186220
; GFX1250-NEXT: scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS
62196221
; GFX1250-NEXT: s_wait_storecnt 0x0
62206222
; GFX1250-NEXT: v_readlane_b32 s31, v9, 1

llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7409,8 +7409,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
74097409
; GFX1250-SDAG-NEXT: v_mul_lo_u32 v3, v1, v0
74107410
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v2, off scope:SCOPE_SYS
74117411
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
7412+
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
74127413
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v1, off scope:SCOPE_SYS
74137414
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
7415+
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
74147416
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v3, off scope:SCOPE_SYS
74157417
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
74167418
; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v0
@@ -7431,8 +7433,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
74317433
; GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v1, v0
74327434
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS
74337435
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
7436+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
74347437
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v1, off scope:SCOPE_SYS
74357438
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
7439+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
74367440
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS
74377441
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
74387442
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v0
@@ -7686,6 +7690,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
76867690
; GFX1250-SDAG-NEXT: v_mul_lo_u32 v3, v0, v1
76877691
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v2, off scope:SCOPE_SYS
76887692
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
7693+
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
76897694
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v3, off scope:SCOPE_SYS
76907695
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
76917696
; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v1
@@ -7706,6 +7711,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
77067711
; GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v0, v1
77077712
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS
77087713
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
7714+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
77097715
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS
77107716
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
77117717
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v1

0 commit comments

Comments
 (0)