Skip to content

Commit 716b8cc

Browse files
committed
[AMDGPU][gfx1250] Implement SIMemoryLegalizer
Implements the base of the MemoryLegalizer for a roughly correct GFX1250 memory model. Documentation will come later, and some remaining changes still have to be added, but this is the backbone of the model.
1 parent ba265d8 commit 716b8cc

30 files changed

+1792
-1012
lines changed

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1831,6 +1831,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18311831
bool hasScratchBaseForwardingHazard() const {
18321832
return GFX1250Insts && getGeneration() == GFX12;
18331833
}
1834+
1835+
/// \returns true if the subtarget requires a wait for xcnt before atomic
1836+
/// flat/global stores and read-modify-write (RMW) operations.
1837+
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
18341838
};
18351839

18361840
class GCNUserSGPRUsageInfo {

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1051,6 +1051,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
10511051
return AMDGPU::S_WAIT_DSCNT;
10521052
case AMDGPU::S_WAIT_KMCNT_soft:
10531053
return AMDGPU::S_WAIT_KMCNT;
1054+
case AMDGPU::S_WAIT_XCNT_soft:
1055+
return AMDGPU::S_WAIT_XCNT;
10541056
default:
10551057
return Opcode;
10561058
}

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 54 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,11 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
587587
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
588588

589589
public:
590-
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
590+
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
591+
// GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
592+
// the behavior is the same if assuming GFX12.0 in CU mode.
593+
assert(ST.hasGFX1250Insts() ? ST.isCuModeEnabled() : true);
594+
}
591595

592596
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
593597
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -2340,12 +2344,16 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
23402344
STORECnt |= true;
23412345
break;
23422346
case SIAtomicScope::WORKGROUP:
2343-
// In WGP mode the waves of a work-group can be executing on either CU of
2344-
// the WGP. Therefore need to wait for operations to complete to ensure
2345-
// they are visible to waves in the other CU as the L0 is per CU.
2346-
// Otherwise in CU mode and all waves of a work-group are on the same CU
2347-
// which shares the same L0.
2348-
if (!ST.isCuModeEnabled()) {
2347+
// GFX12.0:
2348+
// In WGP mode the waves of a work-group can be executing on either CU
2349+
// of the WGP. Therefore need to wait for operations to complete to
2350+
// ensure they are visible to waves in the other CU as the L0 is per CU.
2351+
// Otherwise, in CU mode, all waves of a work-group are on the same CU,
2352+
// which shares the same L0.
2353+
//
2354+
// GFX12.5:
2355+
// TODO DOCS
2356+
if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
23492357
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
23502358
LOADCnt |= true;
23512359
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2366,7 +2374,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
23662374
switch (Scope) {
23672375
case SIAtomicScope::SYSTEM:
23682376
case SIAtomicScope::AGENT:
2369-
case SIAtomicScope::WORKGROUP:
2377+
case SIAtomicScope::WORKGROUP:
23702378
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
23712379
// not needed as LDS operations for all waves are executed in a total
23722380
// global ordering as observed by all waves. Required if also
@@ -2397,7 +2405,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
23972405
//
23982406
// This also applies to fences. Fences cannot pair with an instruction
23992407
// tracked with bvh/samplecnt as we don't have any atomics that do that.
2400-
if (Order != AtomicOrdering::Acquire) {
2408+
if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
24012409
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
24022410
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
24032411
}
@@ -2449,10 +2457,14 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
24492457
ScopeImm = AMDGPU::CPol::SCOPE_DEV;
24502458
break;
24512459
case SIAtomicScope::WORKGROUP:
2452-
// In WGP mode the waves of a work-group can be executing on either CU of
2453-
// the WGP. Therefore we need to invalidate the L0 which is per CU.
2454-
// Otherwise in CU mode all waves of a work-group are on the same CU, and so
2455-
// the L0 does not need to be invalidated.
2460+
// GFX12.0:
2461+
// In WGP mode the waves of a work-group can be executing on either CU of
2462+
// the WGP. Therefore we need to invalidate the L0 which is per CU.
2463+
// Otherwise in CU mode all waves of a work-group are on the same CU, and
2464+
// so the L0 does not need to be invalidated.
2465+
//
2466+
// GFX12.5:
2467+
// TODO DOCS
24562468
if (ST.isCuModeEnabled())
24572469
return false;
24582470

@@ -2497,7 +2509,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
24972509
if (Pos == Position::AFTER)
24982510
++MI;
24992511

2500-
// global_wb is only necessary at system scope for gfx120x targets.
2512+
// global_wb is only necessary at system scope for GFX12.0,
2513+
// but it is also necessary at device scope for GFX12.5.
25012514
//
25022515
// Emitting it for lower scopes is a slow no-op, so we omit it
25032516
// for performance.
@@ -2507,6 +2520,12 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
25072520
.addImm(AMDGPU::CPol::SCOPE_SYS);
25082521
break;
25092522
case SIAtomicScope::AGENT:
2523+
// TODO DOCS
2524+
if (ST.hasGFX1250Insts()) {
2525+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2526+
.addImm(AMDGPU::CPol::SCOPE_DEV);
2527+
}
2528+
break;
25102529
case SIAtomicScope::WORKGROUP:
25112530
// No WB necessary, but we still have to wait.
25122531
break;
@@ -2569,17 +2588,31 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
25692588
}
25702589

25712590
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2572-
MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2573-
if (!CPol)
2574-
return false;
2591+
assert(MI.mayStore() && "Not a Store inst");
2592+
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2593+
bool Changed = false;
2594+
2595+
// GFX12.5 only: xcnt wait is needed before flat and global atomic stores/rmw.
2596+
if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
2597+
MachineBasicBlock &MBB = *MI.getParent();
2598+
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2599+
Changed = true;
2600+
}
2601+
2602+
// Remaining fixes do not apply to RMWs.
2603+
if (IsRMW)
2604+
return Changed;
25752605

2606+
MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2607+
if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2608+
return Changed;
25762609
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
25772610

25782611
// GFX12.0 only: Extra waits needed before system scope stores.
25792612
if (!ST.hasGFX1250Insts()) {
25802613
if (!Atomic && Scope == CPol::SCOPE_SYS)
25812614
return insertWaitsBeforeSystemScopeStore(MI);
2582-
return false;
2615+
return Changed;
25832616
}
25842617

25852618
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
@@ -2589,7 +2622,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
25892622
(!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
25902623
return setScope(MI, CPol::SCOPE_SE);
25912624

2592-
return false;
2625+
return Changed;
25932626
}
25942627

25952628
bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
@@ -2778,6 +2811,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
27782811
assert(MI->mayLoad() && MI->mayStore());
27792812

27802813
bool Changed = false;
2814+
MachineInstr &RMWMI = *MI;
27812815

27822816
if (MOI.isAtomic()) {
27832817
const AtomicOrdering Order = MOI.getOrdering();
@@ -2812,6 +2846,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
28122846
Position::AFTER);
28132847
}
28142848

2849+
Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
28152850
return Changed;
28162851
}
28172852

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1656,6 +1656,11 @@ let OtherPredicates = [HasImageInsts] in {
16561656
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
16571657
}
16581658

1659+
1660+
let SubtargetPredicate = HasWaitXcnt in {
1661+
def S_WAIT_XCNT_soft : SOPP_Pseudo<"s_soft_wait_xcnt", (ins s16imm:$simm16), "$simm16">;
1662+
}
1663+
16591664
// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
16601665
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
16611666

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,6 +1501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
15011501
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
15021502
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
15031503
; GFX1250-NEXT: s_wait_storecnt 0x0
1504+
; GFX1250-NEXT: s_wait_xcnt 0x0
15041505
; GFX1250-NEXT: s_wait_kmcnt 0x0
15051506
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
15061507
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1571,6 +1572,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
15711572
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
15721573
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
15731574
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
1575+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
1576+
; GFX1250-NEXT: s_wait_storecnt 0x0
1577+
; GFX1250-NEXT: s_wait_xcnt 0x0
15741578
; GFX1250-NEXT: s_wait_kmcnt 0x0
15751579
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
15761580
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1645,6 +1649,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
16451649
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
16461650
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
16471651
; GFX1250-NEXT: s_wait_storecnt 0x0
1652+
; GFX1250-NEXT: s_wait_xcnt 0x0
16481653
; GFX1250-NEXT: s_wait_kmcnt 0x0
16491654
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
16501655
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1715,6 +1720,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
17151720
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
17161721
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
17171722
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
1723+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
1724+
; GFX1250-NEXT: s_wait_storecnt 0x0
1725+
; GFX1250-NEXT: s_wait_xcnt 0x0
17181726
; GFX1250-NEXT: s_wait_kmcnt 0x0
17191727
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
17201728
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1792,6 +1800,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl
17921800
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
17931801
; GFX1250-NEXT: s_wait_kmcnt 0x0
17941802
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
1803+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
17951804
; GFX1250-NEXT: s_wait_storecnt 0x0
17961805
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
17971806
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -1902,6 +1911,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
19021911
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
19031912
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
19041913
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
1914+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
1915+
; GFX1250-NEXT: s_wait_storecnt 0x0
1916+
; GFX1250-NEXT: s_wait_xcnt 0x0
19051917
; GFX1250-NEXT: s_wait_kmcnt 0x0
19061918
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
19071919
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1947,6 +1959,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
19471959
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
19481960
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
19491961
; GFX1250-NEXT: s_wait_storecnt 0x0
1962+
; GFX1250-NEXT: s_wait_xcnt 0x0
19501963
; GFX1250-NEXT: s_wait_kmcnt 0x0
19511964
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
19521965
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1987,6 +2000,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
19872000
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
19882001
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
19892002
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
2003+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
2004+
; GFX1250-NEXT: s_wait_storecnt 0x0
2005+
; GFX1250-NEXT: s_wait_xcnt 0x0
19902006
; GFX1250-NEXT: s_wait_kmcnt 0x0
19912007
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
19922008
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2031,6 +2047,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
20312047
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
20322048
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
20332049
; GFX1250-NEXT: s_wait_storecnt 0x0
2050+
; GFX1250-NEXT: s_wait_xcnt 0x0
20342051
; GFX1250-NEXT: s_wait_kmcnt 0x0
20352052
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
20362053
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2107,6 +2124,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
21072124
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
21082125
; GFX1250-NEXT: s_wait_kmcnt 0x0
21092126
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
2127+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
21102128
; GFX1250-NEXT: s_wait_storecnt 0x0
21112129
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
21122130
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2190,6 +2208,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
21902208
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
21912209
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
21922210
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
2211+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
2212+
; GFX1250-NEXT: s_wait_storecnt 0x0
2213+
; GFX1250-NEXT: s_wait_xcnt 0x0
21932214
; GFX1250-NEXT: s_wait_kmcnt 0x0
21942215
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
21952216
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2418,6 +2439,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
24182439
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
24192440
; GFX1250-NEXT: s_wait_kmcnt 0x0
24202441
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
2442+
; GFX1250-NEXT: s_wait_storecnt 0x0
24212443
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
24222444
; GFX1250-NEXT: s_wait_dscnt 0x0
24232445
; GFX1250-NEXT: s_set_pc_i64 s[30:31]

llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ define i16 @global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) {
364364
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
365365
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
366366
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
367+
; GFX1250-NEXT: s_wait_xcnt 0x0
367368
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
368369
; GFX1250-NEXT: s_wait_loadcnt 0x0
369370
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -406,6 +407,7 @@ define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) {
406407
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
407408
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
408409
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
410+
; GFX1250-NEXT: s_wait_xcnt 0x0
409411
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
410412
; GFX1250-NEXT: s_wait_loadcnt 0x0
411413
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -448,6 +450,7 @@ define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) {
448450
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
449451
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
450452
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
453+
; GFX1250-NEXT: s_wait_xcnt 0x0
451454
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
452455
; GFX1250-NEXT: s_wait_loadcnt 0x0
453456
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -490,6 +493,7 @@ define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) {
490493
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
491494
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
492495
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
496+
; GFX1250-NEXT: s_wait_xcnt 0x0
493497
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
494498
; GFX1250-NEXT: s_wait_loadcnt 0x0
495499
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1344,6 +1348,7 @@ define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) {
13441348
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
13451349
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
13461350
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1351+
; GFX1250-NEXT: s_wait_xcnt 0x0
13471352
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13481353
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
13491354
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1386,6 +1391,7 @@ define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) {
13861391
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
13871392
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
13881393
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1394+
; GFX1250-NEXT: s_wait_xcnt 0x0
13891395
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13901396
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
13911397
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1428,6 +1434,7 @@ define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) {
14281434
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
14291435
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
14301436
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1437+
; GFX1250-NEXT: s_wait_xcnt 0x0
14311438
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14321439
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
14331440
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1470,6 +1477,7 @@ define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) {
14701477
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
14711478
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
14721479
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1480+
; GFX1250-NEXT: s_wait_xcnt 0x0
14731481
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14741482
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
14751483
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7

0 commit comments

Comments
 (0)