Skip to content

Commit 54f7d23

Browse files
committed
Merge amd-mi400 into amd-gfx13
2 parents 5299d5a + 12a3fe2 commit 54f7d23

29 files changed

+1619
-31
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1121,6 +1121,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
11211121
return AMDGPU::S_WAIT_DSCNT;
11221122
case AMDGPU::S_WAIT_KMCNT_soft:
11231123
return AMDGPU::S_WAIT_KMCNT;
1124+
case AMDGPU::S_WAIT_XCNT_soft:
1125+
return AMDGPU::S_WAIT_XCNT;
11241126
default:
11251127
return Opcode;
11261128
}

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 42 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -336,8 +336,12 @@ class SICacheControl {
336336
bool IsNonTemporal,
337337
bool IsLastUse = false) const = 0;
338338

339-
virtual bool finalizeStore(MachineBasicBlock::iterator &MI,
340-
bool Atomic) const {
339+
/// Add final touches to a `mayStore` instruction \p MI, which may be a
340+
/// Store or RMW instruction.
341+
/// FIXME: This takes a MI because iterators aren't handled properly. When
342+
/// this is called, they often point to entirely different insts. Thus we back
343+
/// up the inst early and pass it here instead.
344+
virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
341345
return false;
342346
};
343347

@@ -636,8 +640,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
636640
bool IsVolatile, bool IsNonTemporal,
637641
bool IsLastUse) const override;
638642

639-
bool finalizeStore(MachineBasicBlock::iterator &MI,
640-
bool Atomic) const override;
643+
bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
641644

642645
virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
643646

@@ -2633,10 +2636,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
26332636
if (IsVolatile) {
26342637
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
26352638

2636-
if (Op == SIMemOp::STORE && !ST.hasGFX1250Insts() &&
2637-
TII->getNamedOperand(*MI, OpName::cpol))
2638-
Changed |= insertWaitsBeforeSystemScopeStore(MI);
2639-
26402639
// Ensure operation has completed at system scope to cause all volatile
26412640
// operations to be visible outside the program in a global order. Do not
26422641
// request cross address space as only the global address space can be
@@ -2649,39 +2648,48 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
26492648
return Changed;
26502649
}
26512650

2652-
bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
2653-
bool Atomic) const {
2651+
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
26542652
// Only required on gfx120x.
2655-
auto CoreMI = &*MI;
2656-
if (MI->isBundle()) {
2657-
CoreMI = SIInstrInfo::bundleWithGPRIndexing(*MI);
2653+
auto CoreMI = &MI;
2654+
if (MI.isBundle()) {
2655+
CoreMI = SIInstrInfo::bundleWithGPRIndexing(MI);
26582656
assert(CoreMI);
26592657
}
26602658

2661-
// GFX120x specific: we must add waits before a system scope store.
2662-
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2663-
if (!CPol)
2664-
return false;
2659+
assert(MI.mayStore() && "Not a Store inst");
2660+
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2661+
bool Changed = false;
26652662

2666-
// No scope operand means SCOPE_CU.
2663+
// GFX125x only: an xcnt wait is needed before atomic stores/RMWs.
2664+
if (Atomic && ST.hasWaitXCnt()) {
2665+
// TODO: add isAtomic once I have figured out this bug.
2666+
MachineBasicBlock &MBB = *MI.getParent();
2667+
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2668+
Changed = true;
2669+
}
2670+
2671+
// Remaining fixes do not apply to RMWs
2672+
if (IsRMW)
2673+
return Changed;
2674+
2675+
MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2676+
if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2677+
return Changed;
26672678
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
26682679

2669-
// GFX120x only: Extra waits needed before system scope stores.
2680+
// GFX120x only: Extra waits needed before non-atomic system scope stores.
26702681
if (!ST.hasGFX1250Insts()) {
26712682
if (!Atomic && Scope == CPol::SCOPE_SYS)
2672-
return insertWaitsBeforeSystemScopeStore(MI);
2673-
return false;
2683+
Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
2684+
return Changed;
26742685
}
26752686

2676-
// GFX1250 only: Require SCOPE_SE on stores that may hit the scratch address
2687+
// GFX125x only: Require SCOPE_SE on stores that may hit the scratch address
26772688
// space, or if the "cu-stores" target feature is disabled.
2678-
if (Scope != CPol::SCOPE_CU)
2679-
return false;
2680-
2681-
if (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(*MI))
2682-
return setScope(MI, CPol::SCOPE_SE);
2683-
2684-
return false;
2689+
if (Scope == CPol::SCOPE_CU &&
2690+
(!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
2691+
Changed |= setScope(MI, CPol::SCOPE_SE);
2692+
return Changed;
26852693
}
26862694

26872695
bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
@@ -2826,6 +2834,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
28262834
MachineBasicBlock::iterator &MI) {
28272835

28282836
bool Changed = false;
2837+
MachineInstr &StoreMI = *MI;
28292838

28302839
if (MOI.isAtomic()) {
28312840
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
@@ -2847,7 +2856,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
28472856
MOI.getIsCrossAddressSpaceOrdering(),
28482857
Position::BEFORE);
28492858

2850-
Changed |= CC->finalizeStore(MI, /*Atomic=*/true);
2859+
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
28512860
return Changed;
28522861
}
28532862

@@ -2858,7 +2867,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
28582867
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
28592868
MOI.isNonTemporal());
28602869

2861-
Changed |= CC->finalizeStore(MI, /*Atomic=*/false);
2870+
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
28622871

28632872
Changed |= CC->setCFS(MI, MOI.getCFS());
28642873
return Changed;
@@ -2917,6 +2926,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
29172926
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(
29182927
const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI) {
29192928
bool Changed = false;
2929+
MachineInstr &RMWMI = *MI;
29202930

29212931
if (MOI.isAtomic()) {
29222932
const AtomicOrdering Order = MOI.getOrdering();
@@ -2951,6 +2961,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(
29512961
Position::AFTER);
29522962
}
29532963

2964+
Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
29542965
return Changed;
29552966
}
29562967

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1728,6 +1728,10 @@ let OtherPredicates = [HasImageInsts] in {
17281728
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
17291729
}
17301730

1731+
let SubtargetPredicate = HasWaitXcnt in {
1732+
def S_WAIT_XCNT_soft : SOPP_Pseudo<"s_soft_wait_xcnt", (ins s16imm:$simm16), "$simm16">;
1733+
}
1734+
17311735
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
17321736
[(int_amdgcn_s_sethalt timm:$simm16)]>;
17331737
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> {

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,6 +1501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
15011501
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
15021502
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
15031503
; GFX1250-NEXT: s_wait_storecnt 0x0
1504+
; GFX1250-NEXT: s_wait_xcnt 0x0
15041505
; GFX1250-NEXT: s_wait_kmcnt 0x0
15051506
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
15061507
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1573,6 +1574,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
15731574
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
15741575
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
15751576
; GFX1250-NEXT: s_wait_storecnt 0x0
1577+
; GFX1250-NEXT: s_wait_xcnt 0x0
15761578
; GFX1250-NEXT: s_wait_kmcnt 0x0
15771579
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
15781580
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1647,6 +1649,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
16471649
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
16481650
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
16491651
; GFX1250-NEXT: s_wait_storecnt 0x0
1652+
; GFX1250-NEXT: s_wait_xcnt 0x0
16501653
; GFX1250-NEXT: s_wait_kmcnt 0x0
16511654
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
16521655
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1719,6 +1722,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
17191722
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
17201723
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
17211724
; GFX1250-NEXT: s_wait_storecnt 0x0
1725+
; GFX1250-NEXT: s_wait_xcnt 0x0
17221726
; GFX1250-NEXT: s_wait_kmcnt 0x0
17231727
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
17241728
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1909,6 +1913,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
19091913
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
19101914
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
19111915
; GFX1250-NEXT: s_wait_storecnt 0x0
1916+
; GFX1250-NEXT: s_wait_xcnt 0x0
19121917
; GFX1250-NEXT: s_wait_kmcnt 0x0
19131918
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
19141919
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1954,6 +1959,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
19541959
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
19551960
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
19561961
; GFX1250-NEXT: s_wait_storecnt 0x0
1962+
; GFX1250-NEXT: s_wait_xcnt 0x0
19571963
; GFX1250-NEXT: s_wait_kmcnt 0x0
19581964
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
19591965
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1996,6 +2002,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
19962002
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
19972003
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
19982004
; GFX1250-NEXT: s_wait_storecnt 0x0
2005+
; GFX1250-NEXT: s_wait_xcnt 0x0
19992006
; GFX1250-NEXT: s_wait_kmcnt 0x0
20002007
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
20012008
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2040,6 +2047,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
20402047
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
20412048
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
20422049
; GFX1250-NEXT: s_wait_storecnt 0x0
2050+
; GFX1250-NEXT: s_wait_xcnt 0x0
20432051
; GFX1250-NEXT: s_wait_kmcnt 0x0
20442052
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
20452053
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2202,6 +2210,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
22022210
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
22032211
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
22042212
; GFX1250-NEXT: s_wait_storecnt 0x0
2213+
; GFX1250-NEXT: s_wait_xcnt 0x0
22052214
; GFX1250-NEXT: s_wait_kmcnt 0x0
22062215
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
22072216
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0

llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ define i16 @global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) {
364364
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
365365
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
366366
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
367+
; GFX1250-NEXT: s_wait_xcnt 0x0
367368
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
368369
; GFX1250-NEXT: s_wait_loadcnt 0x0
369370
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -406,6 +407,7 @@ define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) {
406407
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
407408
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
408409
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
410+
; GFX1250-NEXT: s_wait_xcnt 0x0
409411
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
410412
; GFX1250-NEXT: s_wait_loadcnt 0x0
411413
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -448,6 +450,7 @@ define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) {
448450
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
449451
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
450452
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
453+
; GFX1250-NEXT: s_wait_xcnt 0x0
451454
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
452455
; GFX1250-NEXT: s_wait_loadcnt 0x0
453456
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -490,6 +493,7 @@ define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) {
490493
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
491494
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
492495
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
496+
; GFX1250-NEXT: s_wait_xcnt 0x0
493497
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
494498
; GFX1250-NEXT: s_wait_loadcnt 0x0
495499
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1334,6 +1338,7 @@ define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) {
13341338
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
13351339
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
13361340
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1341+
; GFX1250-NEXT: s_wait_xcnt 0x0
13371342
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13381343
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
13391344
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1376,6 +1381,7 @@ define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) {
13761381
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
13771382
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
13781383
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1384+
; GFX1250-NEXT: s_wait_xcnt 0x0
13791385
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13801386
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
13811387
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1418,6 +1424,7 @@ define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) {
14181424
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
14191425
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
14201426
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1427+
; GFX1250-NEXT: s_wait_xcnt 0x0
14211428
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14221429
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
14231430
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1460,6 +1467,7 @@ define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) {
14601467
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
14611468
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
14621469
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
1470+
; GFX1250-NEXT: s_wait_xcnt 0x0
14631471
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
14641472
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
14651473
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7

llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
3838
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
3939
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
4040
; GFX1250-NEXT: s_wait_storecnt 0x0
41+
; GFX1250-NEXT: s_wait_xcnt 0x0
4142
; GFX1250-NEXT: s_wait_kmcnt 0x0
4243
; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS
4344
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -79,6 +80,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
7980
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
8081
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
8182
; GFX1250-NEXT: s_wait_storecnt 0x0
83+
; GFX1250-NEXT: s_wait_xcnt 0x0
8284
; GFX1250-NEXT: s_wait_kmcnt 0x0
8385
; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS
8486
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0

llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1473,6 +1473,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
14731473
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
14741474
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
14751475
; GFX1250-NEXT: s_wait_storecnt 0x0
1476+
; GFX1250-NEXT: s_wait_xcnt 0x0
14761477
; GFX1250-NEXT: s_wait_kmcnt 0x0
14771478
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
14781479
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1515,6 +1516,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
15151516
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
15161517
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
15171518
; GFX1250-NEXT: s_wait_storecnt 0x0
1519+
; GFX1250-NEXT: s_wait_xcnt 0x0
15181520
; GFX1250-NEXT: s_wait_kmcnt 0x0
15191521
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
15201522
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1559,6 +1561,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
15591561
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
15601562
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
15611563
; GFX1250-NEXT: s_wait_storecnt 0x0
1564+
; GFX1250-NEXT: s_wait_xcnt 0x0
15621565
; GFX1250-NEXT: s_wait_kmcnt 0x0
15631566
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
15641567
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1601,6 +1604,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
16011604
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
16021605
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
16031606
; GFX1250-NEXT: s_wait_storecnt 0x0
1607+
; GFX1250-NEXT: s_wait_xcnt 0x0
16041608
; GFX1250-NEXT: s_wait_kmcnt 0x0
16051609
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
16061610
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1772,6 +1776,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
17721776
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
17731777
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
17741778
; GFX1250-NEXT: s_wait_storecnt 0x0
1779+
; GFX1250-NEXT: s_wait_xcnt 0x0
17751780
; GFX1250-NEXT: s_wait_kmcnt 0x0
17761781
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
17771782
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1816,6 +1821,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
18161821
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
18171822
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
18181823
; GFX1250-NEXT: s_wait_storecnt 0x0
1824+
; GFX1250-NEXT: s_wait_xcnt 0x0
18191825
; GFX1250-NEXT: s_wait_kmcnt 0x0
18201826
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
18211827
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1858,6 +1864,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
18581864
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
18591865
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
18601866
; GFX1250-NEXT: s_wait_storecnt 0x0
1867+
; GFX1250-NEXT: s_wait_xcnt 0x0
18611868
; GFX1250-NEXT: s_wait_kmcnt 0x0
18621869
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
18631870
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1902,6 +1909,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
19021909
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
19031910
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
19041911
; GFX1250-NEXT: s_wait_storecnt 0x0
1912+
; GFX1250-NEXT: s_wait_xcnt 0x0
19051913
; GFX1250-NEXT: s_wait_kmcnt 0x0
19061914
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
19071915
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2075,6 +2083,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
20752083
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
20762084
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
20772085
; GFX1250-NEXT: s_wait_storecnt 0x0
2086+
; GFX1250-NEXT: s_wait_xcnt 0x0
20782087
; GFX1250-NEXT: s_wait_kmcnt 0x0
20792088
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
20802089
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0

0 commit comments

Comments
 (0)