Skip to content

Commit 12a3fe2

Browse files
authored
[AMDGPU][gfx1250] Add a wait on xcnt before atomic stores (#3151)
SW fix for DEGFXMI400-11186. We need to wait on xcnt before atomic stores/RMWs on MI400 to prevent them from being re-run. This was a bit awkward to implement because of SWDEV-544018; I had to change `finalizeStore` and its callers so that it's always called with the right MachineInstr. Depends on #3150.
1 parent 283d7dc commit 12a3fe2

29 files changed

+1616
-28
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1057,6 +1057,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
10571057
return AMDGPU::S_WAIT_DSCNT;
10581058
case AMDGPU::S_WAIT_KMCNT_soft:
10591059
return AMDGPU::S_WAIT_KMCNT;
1060+
case AMDGPU::S_WAIT_XCNT_soft:
1061+
return AMDGPU::S_WAIT_XCNT;
10601062
default:
10611063
return Opcode;
10621064
}

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -335,8 +335,12 @@ class SICacheControl {
335335
bool IsNonTemporal,
336336
bool IsLastUse = false) const = 0;
337337

338-
virtual bool finalizeStore(MachineBasicBlock::iterator &MI,
339-
bool Atomic) const {
338+
/// Add final touches to a `mayStore` instruction \p MI, which may be a
339+
/// Store or RMW instruction.
340+
/// FIXME: This takes a MI because iterators aren't handled properly. When
341+
/// this is called, they often point to entirely different insts. Thus we back
342+
/// up the inst early and pass it here instead.
343+
virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
340344
return false;
341345
};
342346

@@ -627,8 +631,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
627631
bool IsVolatile, bool IsNonTemporal,
628632
bool IsLastUse) const override;
629633

630-
bool finalizeStore(MachineBasicBlock::iterator &MI,
631-
bool Atomic) const override;
634+
bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
632635

633636
virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
634637

@@ -2594,10 +2597,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
25942597
if (IsVolatile) {
25952598
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
25962599

2597-
if (Op == SIMemOp::STORE && !ST.hasGFX1250Insts() &&
2598-
TII->getNamedOperand(*MI, OpName::cpol))
2599-
Changed |= insertWaitsBeforeSystemScopeStore(MI);
2600-
26012600
// Ensure operation has completed at system scope to cause all volatile
26022601
// operations to be visible outside the program in a global order. Do not
26032602
// request cross address space as only the global address space can be
@@ -2610,32 +2609,41 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
26102609
return Changed;
26112610
}
26122611

2613-
bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
2614-
bool Atomic) const {
2615-
// GFX120x specific: we must add waits before a system scope store.
2616-
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2617-
if (!CPol)
2618-
return false;
2612+
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2613+
assert(MI.mayStore() && "Not a Store inst");
2614+
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2615+
bool Changed = false;
2616+
2617+
// GFX125x only: xcnt wait is needed before atomics stores/rmw
2618+
if (Atomic && ST.hasGFX1250Insts()) {
2619+
// TODO: add isAtomic once I figured out this bug.
2620+
MachineBasicBlock &MBB = *MI.getParent();
2621+
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2622+
Changed = true;
2623+
}
2624+
2625+
// Remaining fixes do not apply to RMWs
2626+
if (IsRMW)
2627+
return Changed;
26192628

2620-
// No scope operand means SCOPE_CU.
2629+
MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2630+
if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2631+
return Changed;
26212632
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
26222633

2623-
// GFX120x only: Extra waits needed before system scope stores.
2634+
// GFX120x only: Extra waits needed before non-atomic system scope stores.
26242635
if (!ST.hasGFX1250Insts()) {
26252636
if (!Atomic && Scope == CPol::SCOPE_SYS)
2626-
return insertWaitsBeforeSystemScopeStore(MI);
2627-
return false;
2637+
Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
2638+
return Changed;
26282639
}
26292640

2630-
// GFX1250 only: Require SCOPE_SE on stores that may hit the scratch address
2641+
// GFX125x only: Require SCOPE_SE on stores that may hit the scratch address
26312642
// space, or if the "cu-stores" target feature is disabled.
2632-
if (Scope != CPol::SCOPE_CU)
2633-
return false;
2634-
2635-
if (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(*MI))
2636-
return setScope(MI, CPol::SCOPE_SE);
2637-
2638-
return false;
2643+
if (Scope == CPol::SCOPE_CU &&
2644+
(!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
2645+
Changed |= setScope(MI, CPol::SCOPE_SE);
2646+
return Changed;
26392647
}
26402648

26412649
bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
@@ -2757,6 +2765,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
27572765
assert(!MI->mayLoad() && MI->mayStore());
27582766

27592767
bool Changed = false;
2768+
MachineInstr &StoreMI = *MI;
27602769

27612770
if (MOI.isAtomic()) {
27622771
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
@@ -2778,7 +2787,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
27782787
MOI.getIsCrossAddressSpaceOrdering(),
27792788
Position::BEFORE);
27802789

2781-
Changed |= CC->finalizeStore(MI, /*Atomic=*/true);
2790+
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
27822791
return Changed;
27832792
}
27842793

@@ -2789,7 +2798,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
27892798
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
27902799
MOI.isNonTemporal());
27912800

2792-
Changed |= CC->finalizeStore(MI, /*Atomic=*/false);
2801+
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
27932802
return Changed;
27942803
}
27952804

@@ -2850,6 +2859,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
28502859
assert(MI->mayLoad() && MI->mayStore());
28512860

28522861
bool Changed = false;
2862+
MachineInstr &RMWMI = *MI;
28532863

28542864
if (MOI.isAtomic()) {
28552865
const AtomicOrdering Order = MOI.getOrdering();
@@ -2884,6 +2894,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
28842894
Position::AFTER);
28852895
}
28862896

2897+
Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
28872898
return Changed;
28882899
}
28892900

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1663,6 +1663,10 @@ let OtherPredicates = [HasImageInsts] in {
16631663
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
16641664
}
16651665

1666+
let SubtargetPredicate = HasWaitXcnt in {
1667+
def S_WAIT_XCNT_soft : SOPP_Pseudo<"s_soft_wait_xcnt", (ins s16imm:$simm16), "$simm16">;
1668+
}
1669+
16661670
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
16671671
[(int_amdgcn_s_sethalt timm:$simm16)]>;
16681672
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> {

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,6 +1501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
15011501
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
15021502
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
15031503
; GFX1250-NEXT: s_wait_storecnt 0x0
1504+
; GFX1250-NEXT: s_wait_xcnt 0x0
15041505
; GFX1250-NEXT: s_wait_kmcnt 0x0
15051506
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
15061507
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1573,6 +1574,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
15731574
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
15741575
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
15751576
; GFX1250-NEXT: s_wait_storecnt 0x0
1577+
; GFX1250-NEXT: s_wait_xcnt 0x0
15761578
; GFX1250-NEXT: s_wait_kmcnt 0x0
15771579
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
15781580
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1647,6 +1649,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
16471649
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
16481650
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
16491651
; GFX1250-NEXT: s_wait_storecnt 0x0
1652+
; GFX1250-NEXT: s_wait_xcnt 0x0
16501653
; GFX1250-NEXT: s_wait_kmcnt 0x0
16511654
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
16521655
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1719,6 +1722,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
17191722
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
17201723
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
17211724
; GFX1250-NEXT: s_wait_storecnt 0x0
1725+
; GFX1250-NEXT: s_wait_xcnt 0x0
17221726
; GFX1250-NEXT: s_wait_kmcnt 0x0
17231727
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
17241728
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1909,6 +1913,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
19091913
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
19101914
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
19111915
; GFX1250-NEXT: s_wait_storecnt 0x0
1916+
; GFX1250-NEXT: s_wait_xcnt 0x0
19121917
; GFX1250-NEXT: s_wait_kmcnt 0x0
19131918
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
19141919
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1954,6 +1959,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
19541959
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
19551960
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
19561961
; GFX1250-NEXT: s_wait_storecnt 0x0
1962+
; GFX1250-NEXT: s_wait_xcnt 0x0
19571963
; GFX1250-NEXT: s_wait_kmcnt 0x0
19581964
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
19591965
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1996,6 +2002,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
19962002
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
19972003
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
19982004
; GFX1250-NEXT: s_wait_storecnt 0x0
2005+
; GFX1250-NEXT: s_wait_xcnt 0x0
19992006
; GFX1250-NEXT: s_wait_kmcnt 0x0
20002007
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
20012008
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2040,6 +2047,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
20402047
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
20412048
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
20422049
; GFX1250-NEXT: s_wait_storecnt 0x0
2050+
; GFX1250-NEXT: s_wait_xcnt 0x0
20432051
; GFX1250-NEXT: s_wait_kmcnt 0x0
20442052
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
20452053
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2202,6 +2210,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
22022210
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
22032211
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
22042212
; GFX1250-NEXT: s_wait_storecnt 0x0
2213+
; GFX1250-NEXT: s_wait_xcnt 0x0
22052214
; GFX1250-NEXT: s_wait_kmcnt 0x0
22062215
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
22072216
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0

llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ define i16 @global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) {
364364
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
365365
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
366366
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
367+
; GFX1250-NEXT: s_wait_xcnt 0x0
367368
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
368369
; GFX1250-NEXT: s_wait_loadcnt 0x0
369370
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -406,6 +407,7 @@ define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) {
406407
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
407408
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
408409
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
410+
; GFX1250-NEXT: s_wait_xcnt 0x0
409411
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
410412
; GFX1250-NEXT: s_wait_loadcnt 0x0
411413
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -448,6 +450,7 @@ define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) {
448450
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
449451
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
450452
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
453+
; GFX1250-NEXT: s_wait_xcnt 0x0
451454
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
452455
; GFX1250-NEXT: s_wait_loadcnt 0x0
453456
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -490,6 +493,7 @@ define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) {
490493
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
491494
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
492495
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
496+
; GFX1250-NEXT: s_wait_xcnt 0x0
493497
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
494498
; GFX1250-NEXT: s_wait_loadcnt 0x0
495499
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1334,6 +1338,7 @@ define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) {
13341338
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
13351339
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
13361340
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1341+
; GFX1250-NEXT: s_wait_xcnt 0x0
13371342
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13381343
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
13391344
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1376,6 +1381,7 @@ define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) {
13761381
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
13771382
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
13781383
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1384+
; GFX1250-NEXT: s_wait_xcnt 0x0
13791385
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13801386
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
13811387
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1418,6 +1424,7 @@ define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) {
14181424
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
14191425
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
14201426
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1427+
; GFX1250-NEXT: s_wait_xcnt 0x0
14211428
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14221429
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
14231430
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1460,6 +1467,7 @@ define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) {
14601467
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
14611468
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
14621469
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1470+
; GFX1250-NEXT: s_wait_xcnt 0x0
14631471
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14641472
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
14651473
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7

llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
3838
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
3939
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
4040
; GFX1250-NEXT: s_wait_storecnt 0x0
41+
; GFX1250-NEXT: s_wait_xcnt 0x0
4142
; GFX1250-NEXT: s_wait_kmcnt 0x0
4243
; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS
4344
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -79,6 +80,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
7980
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
8081
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
8182
; GFX1250-NEXT: s_wait_storecnt 0x0
83+
; GFX1250-NEXT: s_wait_xcnt 0x0
8284
; GFX1250-NEXT: s_wait_kmcnt 0x0
8385
; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS
8486
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0

llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1473,6 +1473,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
14731473
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
14741474
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
14751475
; GFX1250-NEXT: s_wait_storecnt 0x0
1476+
; GFX1250-NEXT: s_wait_xcnt 0x0
14761477
; GFX1250-NEXT: s_wait_kmcnt 0x0
14771478
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
14781479
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1515,6 +1516,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
15151516
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
15161517
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
15171518
; GFX1250-NEXT: s_wait_storecnt 0x0
1519+
; GFX1250-NEXT: s_wait_xcnt 0x0
15181520
; GFX1250-NEXT: s_wait_kmcnt 0x0
15191521
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
15201522
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1559,6 +1561,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
15591561
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
15601562
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
15611563
; GFX1250-NEXT: s_wait_storecnt 0x0
1564+
; GFX1250-NEXT: s_wait_xcnt 0x0
15621565
; GFX1250-NEXT: s_wait_kmcnt 0x0
15631566
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
15641567
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1601,6 +1604,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
16011604
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
16021605
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
16031606
; GFX1250-NEXT: s_wait_storecnt 0x0
1607+
; GFX1250-NEXT: s_wait_xcnt 0x0
16041608
; GFX1250-NEXT: s_wait_kmcnt 0x0
16051609
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
16061610
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1772,6 +1776,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
17721776
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
17731777
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
17741778
; GFX1250-NEXT: s_wait_storecnt 0x0
1779+
; GFX1250-NEXT: s_wait_xcnt 0x0
17751780
; GFX1250-NEXT: s_wait_kmcnt 0x0
17761781
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
17771782
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1816,6 +1821,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
18161821
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
18171822
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
18181823
; GFX1250-NEXT: s_wait_storecnt 0x0
1824+
; GFX1250-NEXT: s_wait_xcnt 0x0
18191825
; GFX1250-NEXT: s_wait_kmcnt 0x0
18201826
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
18211827
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1858,6 +1864,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
18581864
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
18591865
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
18601866
; GFX1250-NEXT: s_wait_storecnt 0x0
1867+
; GFX1250-NEXT: s_wait_xcnt 0x0
18611868
; GFX1250-NEXT: s_wait_kmcnt 0x0
18621869
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
18631870
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1902,6 +1909,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
19021909
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
19031910
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
19041911
; GFX1250-NEXT: s_wait_storecnt 0x0
1912+
; GFX1250-NEXT: s_wait_xcnt 0x0
19051913
; GFX1250-NEXT: s_wait_kmcnt 0x0
19061914
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
19071915
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2075,6 +2083,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
20752083
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
20762084
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
20772085
; GFX1250-NEXT: s_wait_storecnt 0x0
2086+
; GFX1250-NEXT: s_wait_xcnt 0x0
20782087
; GFX1250-NEXT: s_wait_kmcnt 0x0
20792088
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
20802089
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0

0 commit comments

Comments
 (0)