Skip to content

Commit 1bcb775

Browse files
committed
[AMDGPU][gfx1250] Add wait_xcnt before any access that cannot be repeated
All volatile accesses are concerned, and buffer operations are also concerned by this.
1 parent 274a4c0 commit 1bcb775

19 files changed

+508
-6
lines changed

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1871,9 +1871,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18711871
/// \returns true if the subtarget supports clusters of workgroups.
18721872
bool hasClusters() const { return HasClusters; }
18731873

1874-
/// \returns true if the subtarget requires a wait for xcnt before atomic
1875-
/// flat/global stores & rmw.
1876-
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
1874+
/// \returns true if the subtarget requires a wait for xcnt before VMEM
1875+
/// accesses that must never be repeated in the event of a page fault/re-try.
1876+
/// Atomic stores/rmw and all volatile accesses fall under this criteria.
1877+
bool requiresWaitXCntForSingleAccessInstructions() const {
1878+
return GFX1250Insts;
1879+
}
18771880

18781881
/// \returns the number of significant bits in the immediate field of the
18791882
/// S_NOP instruction.

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9934,6 +9934,11 @@ void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
99349934
}
99359935
}
99369936

9937+
bool SIInstrInfo::isVBUFFER(const MachineInstr &MI) const {
9938+
return (ST.getGeneration() == GCNSubtarget::GFX12) &&
9939+
(MI.getDesc().TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF));
9940+
}
9941+
99379942
bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
99389943
if (!isSMRD(MI))
99399944
return false;

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
585585
return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
586586
}
587587

588+
/// \returns true if \p MI is a GFX12 VBUFFER instruction.
589+
bool isVBUFFER(const MachineInstr &MI) const;
590+
588591
static bool isSMRD(const MachineInstr &MI) {
589592
return MI.getDesc().TSFlags & SIInstrFlags::SMRD;
590593
}

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,13 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
776776
}
777777
}
778778

779+
// FIXME: The MMO of buffer atomic instructions does not always have an atomic
780+
// ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
781+
// here, but the lowering should really be cleaned up at some point.
782+
if (ST.getInstrInfo()->isVBUFFER(*MI) && SIInstrInfo::isAtomic(*MI) &&
783+
Ordering == AtomicOrdering::NotAtomic)
784+
Ordering = AtomicOrdering::Monotonic;
785+
779786
SIAtomicScope Scope = SIAtomicScope::NONE;
780787
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
781788
bool IsCrossAddressSpaceOrdering = false;
@@ -2059,6 +2066,13 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
20592066
if (IsVolatile) {
20602067
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
20612068

2069+
if (ST.requiresWaitXCntForSingleAccessInstructions() &&
2070+
(TII->isFLAT(*MI) || TII->isVBUFFER(*MI))) {
2071+
MachineBasicBlock &MBB = *MI->getParent();
2072+
BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2073+
Changed = true;
2074+
}
2075+
20622076
// Ensure operation has completed at system scope to cause all volatile
20632077
// operations to be visible outside the program in a global order. Do not
20642078
// request cross address space as only the global address space can be
@@ -2077,9 +2091,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
20772091
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
20782092
bool Changed = false;
20792093

2080-
// GFX12.5 only: xcnt wait is needed before flat and global atomics
2081-
// stores/rmw.
2082-
if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
2094+
if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
2095+
(TII->isFLAT(MI) || TII->isVBUFFER(MI))) {
20832096
MachineBasicBlock &MBB = *MI.getParent();
20842097
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
20852098
Changed = true;

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5136,6 +5136,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
51365136
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
51375137
; GFX1250-NEXT: scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
51385138
; GFX1250-NEXT: s_wait_storecnt 0x0
5139+
; GFX1250-NEXT: s_wait_xcnt 0x0
51395140
; GFX1250-NEXT: scratch_store_b32 v4, v0, off scope:SCOPE_SYS
51405141
; GFX1250-NEXT: s_wait_storecnt 0x0
51415142
; GFX1250-NEXT: v_readlane_b32 s31, v5, 1
@@ -6215,6 +6216,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
62156216
; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1]
62166217
; GFX1250-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
62176218
; GFX1250-NEXT: s_wait_storecnt 0x0
6219+
; GFX1250-NEXT: s_wait_xcnt 0x0
62186220
; GFX1250-NEXT: scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS
62196221
; GFX1250-NEXT: s_wait_storecnt 0x0
62206222
; GFX1250-NEXT: v_readlane_b32 s31, v9, 1

llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7409,8 +7409,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
74097409
; GFX1250-SDAG-NEXT: v_mul_lo_u32 v3, v1, v0
74107410
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v2, off scope:SCOPE_SYS
74117411
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
7412+
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
74127413
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v1, off scope:SCOPE_SYS
74137414
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
7415+
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
74147416
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v3, off scope:SCOPE_SYS
74157417
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
74167418
; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v0
@@ -7431,8 +7433,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
74317433
; GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v1, v0
74327434
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS
74337435
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
7436+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
74347437
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v1, off scope:SCOPE_SYS
74357438
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
7439+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
74367440
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS
74377441
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
74387442
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v0
@@ -7686,6 +7690,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
76867690
; GFX1250-SDAG-NEXT: v_mul_lo_u32 v3, v0, v1
76877691
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v2, off scope:SCOPE_SYS
76887692
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
7693+
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
76897694
; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v3, off scope:SCOPE_SYS
76907695
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
76917696
; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v1
@@ -7706,6 +7711,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
77067711
; GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v0, v1
77077712
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS
77087713
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
7714+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
77097715
; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS
77107716
; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
77117717
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v1

0 commit comments

Comments
 (0)