Skip to content

Commit bed9be9

Browse files
authored
[AMDGPU][gfx1250] Implement SIMemoryLegalizer (#154726)
Implements the base of the MemoryLegalizer for a roughly correct GFX1250 memory model. Documentation will come later, and some remaining changes still have to be added, but this is the backbone of the model.
1 parent 4d9a7fa commit bed9be9

39 files changed

+2523
-1435
lines changed

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1835,6 +1835,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18351835
bool hasScratchBaseForwardingHazard() const {
18361836
return GFX1250Insts && getGeneration() == GFX12;
18371837
}
1838+
1839+
/// \returns true if the subtarget requires a wait for xcnt before atomic
1840+
/// flat/global stores & rmw.
1841+
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
18381842
};
18391843

18401844
class GCNUserSGPRUsageInfo {

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,6 +1055,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
10551055
return AMDGPU::S_WAIT_DSCNT;
10561056
case AMDGPU::S_WAIT_KMCNT_soft:
10571057
return AMDGPU::S_WAIT_KMCNT;
1058+
case AMDGPU::S_WAIT_XCNT_soft:
1059+
return AMDGPU::S_WAIT_XCNT;
10581060
default:
10591061
return Opcode;
10601062
}

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 56 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -606,7 +606,11 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
606606
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
607607

608608
public:
609-
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
609+
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
610+
// GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
611+
// the behavior is the same if assuming GFX12.0 in CU mode.
612+
assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
613+
}
610614

611615
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
612616
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -2198,7 +2202,8 @@ bool SIGfx10CacheControl::insertBarrierStart(
21982202
// mode. This is because a CU mode release fence does not emit any wait, which
21992203
// is fine when only dealing with vmem, but isn't sufficient in the presence
22002204
// of barriers which do not go through vmem.
2201-
if (!ST.isCuModeEnabled())
2205+
// GFX12.5 does not require this additional wait.
2206+
if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
22022207
return false;
22032208

22042209
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
@@ -2378,12 +2383,16 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
23782383
STORECnt |= true;
23792384
break;
23802385
case SIAtomicScope::WORKGROUP:
2381-
// In WGP mode the waves of a work-group can be executing on either CU of
2382-
// the WGP. Therefore need to wait for operations to complete to ensure
2383-
// they are visible to waves in the other CU as the L0 is per CU.
2384-
// Otherwise in CU mode and all waves of a work-group are on the same CU
2385-
// which shares the same L0.
2386-
if (!ST.isCuModeEnabled()) {
2386+
// GFX12.0:
2387+
// In WGP mode the waves of a work-group can be executing on either CU
2388+
// of the WGP. Therefore need to wait for operations to complete to
2389+
// ensure they are visible to waves in the other CU as the L0 is per CU.
2390+
// Otherwise in CU mode and all waves of a work-group are on the same CU
2391+
// which shares the same L0.
2392+
//
2393+
// GFX12.5:
2394+
// TODO DOCS
2395+
if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
23872396
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
23882397
LOADCnt |= true;
23892398
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2435,7 +2444,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
24352444
//
24362445
// This also applies to fences. Fences cannot pair with an instruction
24372446
// tracked with bvh/samplecnt as we don't have any atomics that do that.
2438-
if (Order != AtomicOrdering::Acquire) {
2447+
if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
24392448
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
24402449
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
24412450
}
@@ -2487,10 +2496,14 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
24872496
ScopeImm = AMDGPU::CPol::SCOPE_DEV;
24882497
break;
24892498
case SIAtomicScope::WORKGROUP:
2490-
// In WGP mode the waves of a work-group can be executing on either CU of
2491-
// the WGP. Therefore we need to invalidate the L0 which is per CU.
2492-
// Otherwise in CU mode all waves of a work-group are on the same CU, and so
2493-
// the L0 does not need to be invalidated.
2499+
// GFX12.0:
2500+
// In WGP mode the waves of a work-group can be executing on either CU of
2501+
// the WGP. Therefore we need to invalidate the L0 which is per CU.
2502+
// Otherwise in CU mode all waves of a work-group are on the same CU, and
2503+
// so the L0 does not need to be invalidated.
2504+
//
2505+
// GFX12.5:
2506+
// TODO DOCS
24942507
if (ST.isCuModeEnabled())
24952508
return false;
24962509

@@ -2535,7 +2548,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
25352548
if (Pos == Position::AFTER)
25362549
++MI;
25372550

2538-
// global_wb is only necessary at system scope for gfx120x targets.
2551+
// global_wb is only necessary at system scope for GFX12.0;
2552+
// it is also necessary at device scope for GFX12.5.
25392553
//
25402554
// Emitting it for lower scopes is a slow no-op, so we omit it
25412555
// for performance.
@@ -2545,6 +2559,12 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
25452559
.addImm(AMDGPU::CPol::SCOPE_SYS);
25462560
break;
25472561
case SIAtomicScope::AGENT:
2562+
// TODO DOCS
2563+
if (ST.hasGFX1250Insts()) {
2564+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2565+
.addImm(AMDGPU::CPol::SCOPE_DEV);
2566+
}
2567+
break;
25482568
case SIAtomicScope::WORKGROUP:
25492569
// No WB necessary, but we still have to wait.
25502570
break;
@@ -2607,17 +2627,32 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
26072627
}
26082628

26092629
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2610-
MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2611-
if (!CPol)
2612-
return false;
2630+
assert(MI.mayStore() && "Not a Store inst");
2631+
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2632+
bool Changed = false;
2633+
2634+
// GFX12.5 only: xcnt wait is needed before flat and global atomic
2635+
// stores/rmw.
2636+
if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
2637+
MachineBasicBlock &MBB = *MI.getParent();
2638+
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2639+
Changed = true;
2640+
}
2641+
2642+
// Remaining fixes do not apply to RMWs.
2643+
if (IsRMW)
2644+
return Changed;
26132645

2646+
MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2647+
if (!CPol) // Some vmem operations do not have a scope and are not affected.
2648+
return Changed;
26142649
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
26152650

26162651
// GFX12.0 only: Extra waits needed before system scope stores.
26172652
if (!ST.hasGFX1250Insts()) {
26182653
if (!Atomic && Scope == CPol::SCOPE_SYS)
26192654
return insertWaitsBeforeSystemScopeStore(MI);
2620-
return false;
2655+
return Changed;
26212656
}
26222657

26232658
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
@@ -2627,7 +2662,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
26272662
(!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
26282663
return setScope(MI, CPol::SCOPE_SE);
26292664

2630-
return false;
2665+
return Changed;
26312666
}
26322667

26332668
bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
@@ -2839,6 +2874,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
28392874
assert(MI->mayLoad() && MI->mayStore());
28402875

28412876
bool Changed = false;
2877+
MachineInstr &RMWMI = *MI;
28422878

28432879
if (MOI.isAtomic()) {
28442880
const AtomicOrdering Order = MOI.getOrdering();
@@ -2873,6 +2909,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
28732909
Position::AFTER);
28742910
}
28752911

2912+
Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
28762913
return Changed;
28772914
}
28782915

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1653,6 +1653,11 @@ let OtherPredicates = [HasImageInsts] in {
16531653
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
16541654
}
16551655

1656+
1657+
let SubtargetPredicate = HasWaitXcnt in {
1658+
def S_WAIT_XCNT_soft : SOPP_Pseudo<"", (ins s16imm:$simm16), "$simm16">;
1659+
}
1660+
16561661
// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
16571662
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
16581663

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,6 +1501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
15011501
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
15021502
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
15031503
; GFX1250-NEXT: s_wait_storecnt 0x0
1504+
; GFX1250-NEXT: s_wait_xcnt 0x0
15041505
; GFX1250-NEXT: s_wait_kmcnt 0x0
15051506
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
15061507
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1571,6 +1572,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
15711572
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
15721573
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
15731574
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
1575+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
1576+
; GFX1250-NEXT: s_wait_storecnt 0x0
1577+
; GFX1250-NEXT: s_wait_xcnt 0x0
15741578
; GFX1250-NEXT: s_wait_kmcnt 0x0
15751579
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
15761580
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1645,6 +1649,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
16451649
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
16461650
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
16471651
; GFX1250-NEXT: s_wait_storecnt 0x0
1652+
; GFX1250-NEXT: s_wait_xcnt 0x0
16481653
; GFX1250-NEXT: s_wait_kmcnt 0x0
16491654
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
16501655
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1715,6 +1720,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
17151720
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
17161721
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
17171722
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
1723+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
1724+
; GFX1250-NEXT: s_wait_storecnt 0x0
1725+
; GFX1250-NEXT: s_wait_xcnt 0x0
17181726
; GFX1250-NEXT: s_wait_kmcnt 0x0
17191727
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
17201728
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1792,6 +1800,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl
17921800
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
17931801
; GFX1250-NEXT: s_wait_kmcnt 0x0
17941802
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
1803+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
17951804
; GFX1250-NEXT: s_wait_storecnt 0x0
17961805
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
17971806
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -1902,6 +1911,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
19021911
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
19031912
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
19041913
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
1914+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
1915+
; GFX1250-NEXT: s_wait_storecnt 0x0
1916+
; GFX1250-NEXT: s_wait_xcnt 0x0
19051917
; GFX1250-NEXT: s_wait_kmcnt 0x0
19061918
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
19071919
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1947,6 +1959,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
19471959
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
19481960
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
19491961
; GFX1250-NEXT: s_wait_storecnt 0x0
1962+
; GFX1250-NEXT: s_wait_xcnt 0x0
19501963
; GFX1250-NEXT: s_wait_kmcnt 0x0
19511964
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
19521965
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1987,6 +2000,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
19872000
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
19882001
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
19892002
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
2003+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
2004+
; GFX1250-NEXT: s_wait_storecnt 0x0
2005+
; GFX1250-NEXT: s_wait_xcnt 0x0
19902006
; GFX1250-NEXT: s_wait_kmcnt 0x0
19912007
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
19922008
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2031,6 +2047,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
20312047
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
20322048
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
20332049
; GFX1250-NEXT: s_wait_storecnt 0x0
2050+
; GFX1250-NEXT: s_wait_xcnt 0x0
20342051
; GFX1250-NEXT: s_wait_kmcnt 0x0
20352052
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
20362053
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2107,6 +2124,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
21072124
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
21082125
; GFX1250-NEXT: s_wait_kmcnt 0x0
21092126
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
2127+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
21102128
; GFX1250-NEXT: s_wait_storecnt 0x0
21112129
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
21122130
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2190,6 +2208,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
21902208
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
21912209
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
21922210
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
2211+
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
2212+
; GFX1250-NEXT: s_wait_storecnt 0x0
2213+
; GFX1250-NEXT: s_wait_xcnt 0x0
21932214
; GFX1250-NEXT: s_wait_kmcnt 0x0
21942215
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
21952216
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2418,6 +2439,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
24182439
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
24192440
; GFX1250-NEXT: s_wait_kmcnt 0x0
24202441
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
2442+
; GFX1250-NEXT: s_wait_storecnt 0x0
24212443
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
24222444
; GFX1250-NEXT: s_wait_dscnt 0x0
24232445
; GFX1250-NEXT: s_set_pc_i64 s[30:31]

llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ define i16 @global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) {
364364
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
365365
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
366366
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
367+
; GFX1250-NEXT: s_wait_xcnt 0x0
367368
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
368369
; GFX1250-NEXT: s_wait_loadcnt 0x0
369370
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -406,6 +407,7 @@ define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) {
406407
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
407408
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
408409
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
410+
; GFX1250-NEXT: s_wait_xcnt 0x0
409411
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
410412
; GFX1250-NEXT: s_wait_loadcnt 0x0
411413
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -448,6 +450,7 @@ define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) {
448450
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
449451
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
450452
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
453+
; GFX1250-NEXT: s_wait_xcnt 0x0
451454
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
452455
; GFX1250-NEXT: s_wait_loadcnt 0x0
453456
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -490,6 +493,7 @@ define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) {
490493
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
491494
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
492495
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
496+
; GFX1250-NEXT: s_wait_xcnt 0x0
493497
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
494498
; GFX1250-NEXT: s_wait_loadcnt 0x0
495499
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1344,6 +1348,7 @@ define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) {
13441348
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
13451349
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
13461350
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1351+
; GFX1250-NEXT: s_wait_xcnt 0x0
13471352
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13481353
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
13491354
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1386,6 +1391,7 @@ define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) {
13861391
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
13871392
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
13881393
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1394+
; GFX1250-NEXT: s_wait_xcnt 0x0
13891395
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13901396
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
13911397
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1428,6 +1434,7 @@ define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) {
14281434
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
14291435
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
14301436
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1437+
; GFX1250-NEXT: s_wait_xcnt 0x0
14311438
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14321439
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
14331440
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1470,6 +1477,7 @@ define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) {
14701477
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
14711478
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
14721479
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1480+
; GFX1250-NEXT: s_wait_xcnt 0x0
14731481
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14741482
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
14751483
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7

0 commit comments

Comments
 (0)