diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 306d59d0867cd..70db7b4918515 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -531,6 +531,7 @@ class SIInsertWaitcnts { // instruction. WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { switch (Inst.getOpcode()) { + // FIXME: GLOBAL_INV needs to be tracked with xcnt too. case AMDGPU::GLOBAL_INV: return VMEM_READ_ACCESS; // tracked using loadcnt case AMDGPU::GLOBAL_WB: @@ -633,8 +634,11 @@ class WaitcntBrackets { const MachineOperand &Op) const; bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait); void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait); + bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait); + void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait); void determineWait(InstCounterType T, RegInterval Interval, AMDGPU::Waitcnt &Wait) const; @@ -646,7 +650,6 @@ class WaitcntBrackets { void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); - void applyXcnt(const AMDGPU::Waitcnt &Wait); void updateByEvent(WaitEventType E, MachineInstr &MI); unsigned hasPendingEvent() const { return PendingEvents; } @@ -1192,7 +1195,7 @@ void WaitcntBrackets::print(raw_ostream &OS) const { /// Simplify the waitcnt, in the sense of removing redundant counts, and return /// whether a waitcnt instruction is needed at all. 
-void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { +void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) { simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); simplifyWaitcnt(DS_CNT, Wait.DsCnt); @@ -1200,7 +1203,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); simplifyWaitcnt(KM_CNT, Wait.KmCnt); - simplifyWaitcnt(X_CNT, Wait.XCnt); + simplifyXcnt(Wait, Wait); } void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, @@ -1270,7 +1273,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); applyWaitcnt(BVH_CNT, Wait.BvhCnt); applyWaitcnt(KM_CNT, Wait.KmCnt); - applyXcnt(Wait); + applyWaitcnt(X_CNT, Wait.XCnt); } void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { @@ -1287,35 +1290,42 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { } } -void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { - // On entry to a block with multiple predescessors, there may - // be pending SMEM and VMEM events active at the same time. - // In such cases, only clear one active event at a time. - +bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) { // Wait on XCNT is redundant if we are already waiting for a load to complete. // SMEM can return out of order, so only omit XCNT wait if we are waiting till // zero. - if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) { - if (!hasMixedPendingEvents(X_CNT)) - applyWaitcnt(X_CNT, 0); - else - PendingEvents &= ~(1 << SMEM_GROUP); - return; - } + return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP); +} +bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) { // If we have pending store we cannot optimize XCnt because we do not wait for // stores. 
VMEM loads retun in order, so if we only have loads XCnt is // decremented to the same number as LOADCnt. - if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && - !hasPendingEvent(STORE_CNT)) { - if (!hasMixedPendingEvents(X_CNT)) - applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); - else if (Wait.LoadCnt == 0) + return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && + !hasPendingEvent(STORE_CNT); +} + +void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) { + // Try to simplify xcnt further by checking for joint kmcnt and loadcnt + // optimizations. On entry to a block with multiple predecessors, there may + // be pending SMEM and VMEM events active at the same time. + // In such cases, only clear one active event at a time. + // TODO: Revisit xcnt optimizations for gfx1250. + if (hasRedundantXCntWithKmCnt(CheckWait)) { + if (!hasMixedPendingEvents(X_CNT)) { + applyWaitcnt(X_CNT, 0); + } else { + PendingEvents &= ~(1 << SMEM_GROUP); + } + } else if (canOptimizeXCntWithLoadCnt(CheckWait)) { + if (!hasMixedPendingEvents(X_CNT)) { + applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt)); + } else if (CheckWait.LoadCnt == 0) { PendingEvents &= ~(1 << VMEM_GROUP); - return; + } } - - applyWaitcnt(X_CNT, Wait.XCnt); + simplifyWaitcnt(X_CNT, UpdateWait.XCnt); } // Where there are multiple types of event in the bracket of a counter, @@ -1650,6 +1660,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( } } + // Save the pre-combine waitcnt in order to make xcnt checks. + AMDGPU::Waitcnt PreCombine = Wait; if (CombinedLoadDsCntInstr) { // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need // to be waited for. 
Otherwise, let the instruction be deleted so @@ -1740,6 +1752,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( } for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) || + (CT == LOAD_CNT && + ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) { + // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT + // due to taking the backedge of a block. + ScoreBrackets.simplifyXcnt(PreCombine, Wait); + } if (!WaitInstrs[CT]) continue; @@ -2086,6 +2105,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Verify that the wait is actually needed. ScoreBrackets.simplifyWaitcnt(Wait); + // Since the translation for VMEM addresses occurs in-order, we can apply the + // XCnt if the current instruction is of VMEM type and has a memory + // dependency with another VMEM instruction in flight. + if (Wait.XCnt != ~0u && isVmemAccess(MI)) { + ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt); + Wait.XCnt = ~0u; + } + // When forcing emit, we need to skip terminators because that would break the // terminators of the MBB if we emit a waitcnt between terminators. if (ForceEmitZeroFlag && !MI.isTerminator()) @@ -2154,21 +2181,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, << "Update Instr: " << *It); } - // XCnt may be already consumed by a load wait. - if (Wait.XCnt != ~0u) { - if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) - Wait.XCnt = ~0u; - - if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) - Wait.XCnt = ~0u; - - // Since the translation for VMEM addresses occur in-order, we can skip the - // XCnt if the current instruction is of VMEM type and has a memory - // dependency with another VMEM instruction in flight. 
- if (isVmemAccess(*It)) - Wait.XCnt = ~0u; - } - if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 7e297f46a780e..b5d593a9c15ed 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1501,7 +1501,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1574,7 +1573,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1649,7 +1647,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1722,7 +1719,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] 
scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1913,7 +1909,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1959,7 +1954,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2002,7 +1996,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2047,7 +2040,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2210,7 +2202,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: 
s_wait_storecnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 393e9fecbb308..28d7e6916e519 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2520,6 +2520,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v0, v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 ; GFX1250-NEXT: global_store_b16 v[2:3], v0, off ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2783,6 +2784,7 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_u16 v0, v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1250-NEXT: global_store_b32 v[2:3], v0, off ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2872,6 +2874,7 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_u16 v0, v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 @@ -6850,6 +6853,7 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -6943,6 +6947,7 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -7033,6 +7038,7 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -7134,6 +7140,7 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -7251,6 +7258,7 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5 ; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 @@ -7367,6 +7375,7 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5 ; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 @@ -8001,6 +8010,7 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v0, v[0:1], off ; GFX1250-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -8241,6 +8251,7 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v4, 16, v3 ; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 @@ -8377,6 +8388,7 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v5, 16, v3 ; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 @@ -8522,6 +8534,7 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v5 @@ -8693,6 +8706,7 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v8 :: v_dual_lshlrev_b32 v4, 16, v9 ; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll 
b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index f465e3c505c02..31307b245bafe 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -152,7 +152,6 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scale_offset scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: s_wait_xcnt 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll index e8efa859ce13f..213233e802a96 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll @@ -27,7 +27,6 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocap ; GCN-NEXT: s_wait_dscnt 0x0 ; GCN-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: s_wait_xcnt 0x0 ; GCN-NEXT: v_add_nc_u64_e32 v[0:1], 4, v[0:1] ; GCN-NEXT: v_add_co_u32 v2, s0, v2, 1 ; GCN-NEXT: s_and_b32 vcc_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index 31344c78990b8..f5feeb2f49171 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -2107,7 +2107,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3 ; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -2126,7 +2126,6 @@ 
define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo @@ -2162,7 +2161,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3 ; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -2183,7 +2182,6 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 6a6f232c55e24..2756472652bc9 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -1233,7 +1233,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x1 ; GFX1250-NEXT: s_mov_b32 s4, s14 ; GFX1250-NEXT: s_mov_b32 s5, s15 ; 
GFX1250-NEXT: s_mov_b32 s0, s8 @@ -1443,7 +1442,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x1 ; GFX1250-NEXT: s_mov_b32 s4, s14 ; GFX1250-NEXT: s_mov_b32 s5, s15 ; GFX1250-NEXT: s_mov_b32 s0, s8 diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll index ba761bedb905c..9e5a4428b011f 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll @@ -38,7 +38,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -80,7 +79,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 6484c2f82ff94..831af7b6c10ba 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1473,7 +1473,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; 
GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1516,7 +1515,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1561,7 +1559,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1604,7 +1601,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1776,7 +1772,6 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1821,7 +1816,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1864,7 +1858,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1909,7 +1902,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2083,7 +2075,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 1978e68fdae9c..867cc6e2f8d58 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -8814,7 +8814,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz 
.LBB44_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8857,7 +8857,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9322,7 +9322,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9365,7 +9365,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9844,7 +9844,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 
; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB46_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9888,7 +9888,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB46_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10365,7 +10365,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -10407,7 +10406,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -10857,7 +10855,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -10899,7 +10896,6 @@ define void 
@global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -11363,7 +11359,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -11406,7 +11401,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -11861,7 +11855,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11893,7 +11887,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v3, v5 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12245,7 +12239,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12276,7 +12269,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12631,7 +12623,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12674,7 +12666,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13154,7 +13146,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13196,7 +13187,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13676,7 +13666,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13722,7 +13712,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14273,7 +14263,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14319,7 +14309,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14888,7 +14878,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14936,7 +14926,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15502,7 +15492,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -15547,7 +15536,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -16081,7 +16069,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -16126,7 +16113,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -16678,7 +16664,6 @@ define void 
@global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -16725,7 +16710,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -17269,7 +17253,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17305,7 +17289,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17753,7 +17737,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX1250-TRUE16-NEXT: 
s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -17788,7 +17771,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -18238,7 +18220,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB62_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18284,7 +18266,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB62_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18854,7 +18836,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -18899,7 +18880,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 3324018ca7237..5b36d4cefa2e3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -66,7 +66,6 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index 355d0026091d9..7aecae901becf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -101,7 +101,6 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[2:3] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX1250-NEXT: 
global_store_b32 v[0:1], v0, off ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll index d9f2fc55709a6..f7ed5341141d4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll @@ -330,6 +330,7 @@ define { i32, <3 x i32> } @global_load_tr6_b96_vaddr_no_align2_requirement(ptr a ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v[0:1], off offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2 ; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -348,6 +349,7 @@ define { i32, <3 x i32> } @global_load_tr6_b96_saddr_no_align2_requirement(ptr a ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v0, s[0:1] offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2 ; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 ; GFX1250-NEXT: s_set_pc_i64 s[30:31]