diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 5720b978aada0..80cca7bcfde9c 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -17,6 +17,7 @@
 #include "GCNSubtarget.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include <deque>
 
 using namespace llvm;
 
@@ -50,6 +51,7 @@ class SIPostRABundler {
   bool run(MachineFunction &MF);
 
 private:
+  const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI;
 
   SmallSet<Register, 8> Defs;
@@ -60,6 +62,9 @@ class SIPostRABundler {
   bool isBundleCandidate(const MachineInstr &MI) const;
   bool isDependentLoad(const MachineInstr &MI) const;
   bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const;
+  void reorderLoads(MachineBasicBlock &MBB,
+                    MachineBasicBlock::instr_iterator &BundleStart,
+                    MachineBasicBlock::instr_iterator Next);
 };
 
 constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
@@ -129,6 +134,141 @@ bool SIPostRABundler::canBundle(const MachineInstr &MI,
           !isDependentLoad(NextMI));
 }
 
+static Register getDef(MachineInstr &MI) {
+  assert(MI.getNumExplicitDefs() > 0);
+  return MI.defs().begin()->getReg();
+}
+
+void SIPostRABundler::reorderLoads(
+    MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &BundleStart,
+    MachineBasicBlock::instr_iterator Next) {
+  // Don't reorder ALU, store or scalar clauses.
+  if (!BundleStart->mayLoad() || BundleStart->mayStore() ||
+      SIInstrInfo::isSMRD(*BundleStart) || !BundleStart->getNumExplicitDefs())
+    return;
+
+  // Search to find the usage distance of each defined register in the clause.
+  const unsigned SearchDistance = std::max(Defs.size(), 100UL);
+  SmallDenseMap<Register, unsigned> UseDistance;
+  unsigned MaxDistance = 0;
+  for (MachineBasicBlock::iterator SearchI = Next;
+       SearchI != MBB.end() && MaxDistance < SearchDistance &&
+       UseDistance.size() < Defs.size();
+       ++SearchI, ++MaxDistance) {
+    for (Register Reg : Defs) {
+      if (UseDistance.contains(Reg))
+        continue;
+      if (SearchI->readsRegister(Reg, TRI))
+        UseDistance[Reg] = MaxDistance;
+    }
+  }
+
+  if (UseDistance.empty())
+    return;
+
+  LLVM_DEBUG(dbgs() << "Try bundle reordering\n");
+
+  // Build schedule based on use distance of register uses.
+  // Attempt to preserve existing order (NativeOrder) where possible.
+  std::deque<std::pair<MachineInstr *, unsigned>> Schedule;
+  unsigned NativeOrder = 0, LastOrder = 0;
+  bool Reordered = false;
+  for (auto II = BundleStart; II != Next; ++II, ++NativeOrder) {
+    // Bail out if we encounter anything that seems risky to reorder.
+    if (!II->getNumExplicitDefs() || II->isKill() ||
+        llvm::any_of(II->memoperands(), [&](const MachineMemOperand *MMO) {
+          return MMO->isAtomic() || MMO->isVolatile();
+        })) {
+      LLVM_DEBUG(dbgs() << " Abort\n");
+      return;
+    }
+
+    Register Reg = getDef(*II);
+    unsigned NewOrder =
+        UseDistance.contains(Reg) ? UseDistance[Reg] : MaxDistance;
+    LLVM_DEBUG(dbgs() << " Order: " << NewOrder << "," << NativeOrder
+                      << ", MI: " << *II);
+    unsigned Order = (NewOrder << 16 | NativeOrder);
+    Schedule.emplace_back(&*II, Order);
+    Reordered |= Order < LastOrder;
+    LastOrder = Order;
+  }
+
+  // No reordering found.
+  if (!Reordered) {
+    LLVM_DEBUG(dbgs() << " No changes\n");
+    return;
+  }
+
+  // Apply sort on new ordering.
+  std::sort(Schedule.begin(), Schedule.end(),
+            [](std::pair<MachineInstr *, unsigned> A,
+               std::pair<MachineInstr *, unsigned> B) {
+              return A.second < B.second;
+            });
+
+  // Rebuild clause order.
+  // Schedule holds ideal order for the load operations; however, each def
+  // can only be scheduled when it will no longer clobber any uses.
+  SmallVector<MachineInstr *> Clause;
+  while (!Schedule.empty()) {
+    // Try to schedule next instruction in schedule.
+    // Iterate until we find something that can be placed.
+    auto It = Schedule.begin();
+    while (It != Schedule.end()) {
+      MachineInstr *MI = It->first;
+      LLVM_DEBUG(dbgs() << "Try schedule: " << *MI);
+
+      if (MI->getNumExplicitDefs() == 0) {
+        // No defs, always schedule.
+        LLVM_DEBUG(dbgs() << " Trivially OK\n");
+        break;
+      }
+
+      Register DefReg = getDef(*MI);
+      bool DefRegHasUse = false;
+      for (auto SearchIt = std::next(It);
+           SearchIt != Schedule.end() && !DefRegHasUse; ++SearchIt)
+        DefRegHasUse = SearchIt->first->readsRegister(DefReg, TRI);
+      if (DefRegHasUse) {
+        // A future use would be clobbered; try next instruction in the
+        // schedule.
+        LLVM_DEBUG(dbgs() << " Clobbers uses\n");
+        It++;
+        continue;
+      }
+
+      // Safe to schedule.
+      LLVM_DEBUG(dbgs() << " OK!\n");
+      break;
+    }
+
+    // Place scheduled instruction into clause order.
+    assert(It != Schedule.end());
+    MachineInstr *MI = It->first;
+    Schedule.erase(It);
+    Clause.push_back(MI);
+
+    // Clear kill flags for later uses.
+    for (auto &Use : MI->all_uses()) {
+      if (!Use.isReg() || !Use.isKill())
+        continue;
+      Register UseReg = Use.getReg();
+      if (llvm::any_of(Schedule, [&](std::pair<MachineInstr *, unsigned> &SI) {
+            return SI.first->readsRegister(UseReg, TRI);
+          }))
+        Use.setIsKill(false);
+    }
+  }
+
+  // Apply order to instructions.
+  for (MachineInstr *MI : Clause)
+    MI->moveBefore(&*Next);
+
+  // Update start of bundle.
+  BundleStart = Clause[0]->getIterator();
+}
+
 bool SIPostRABundlerLegacy::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -143,6 +283,8 @@ PreservedAnalyses SIPostRABundlerPass::run(MachineFunction &MF,
 
 bool SIPostRABundler::run(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
   TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
   BitVector BundleUsedRegUnits(TRI->getNumRegUnits());
   BitVector KillUsedRegUnits(TRI->getNumRegUnits());
 
@@ -170,7 +312,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
       assert(Defs.empty());
 
       if (I->getNumExplicitDefs() != 0)
-        Defs.insert(I->defs().begin()->getReg());
+        Defs.insert(getDef(*I));
 
       MachineBasicBlock::instr_iterator BundleStart = I;
       MachineBasicBlock::instr_iterator BundleEnd = I;
@@ -182,7 +324,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
       if (canBundle(*BundleEnd, *I)) {
         BundleEnd = I;
         if (I->getNumExplicitDefs() != 0)
-          Defs.insert(I->defs().begin()->getReg());
+          Defs.insert(getDef(*I));
         ++ClauseLength;
       } else if (!I->isMetaInstruction() ||
                  I->getOpcode() == AMDGPU::SCHED_BARRIER) {
@@ -234,6 +376,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
         BundleUsedRegUnits.reset();
       }
 
+      reorderLoads(MBB, BundleStart, Next);
       finalizeBundle(MBB, BundleStart, Next);
     }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
index b67080bd4798d..c04f86391c44b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -716,17 +716,17 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 ; GFX9-LABEL: add_v11i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
 ; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
 ; GFX9-NEXT: global_load_ushort 
v15, v[2:3], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off ; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:20 ; GFX9-NEXT: global_load_ushort v17, v[0:1], off offset:20 ; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:18 ; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:18 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_pk_add_u16 v0, v6, v10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll index 6ea0a9446ff9d..7fca4d628d023 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll @@ -750,20 +750,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_clause 0x8 -; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 ; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 ; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 ; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(6) +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(7) ; GFX10-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(4) +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(5) ; GFX10-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(2) +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(3) ; GFX10-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] @@ -777,20 +777,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x8 -; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 ; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 ; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 ; GFX10-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(6) +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(7) ; GFX10-DENORM-NEXT: 
v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(4) +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(5) ; GFX10-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(2) +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(3) ; GFX10-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] @@ -804,20 +804,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou ; GFX11-CONTRACT: ; %bb.0: ; %.entry ; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-CONTRACT-NEXT: s_clause 0x8 -; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v34, off, s32 offset:12 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v35, off, s32 offset:16 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v36, off, s32 offset:20 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v38, off, s32 offset:28 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v39, off, s32 offset:32 -; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(6) +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(7) ; GFX11-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(4) +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(5) ; GFX11-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(2) +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(3) ; GFX11-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] ; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] @@ -833,20 +833,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou ; GFX11-DENORM: ; %bb.0: ; %.entry ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x8 -; GFX11-DENORM-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-DENORM-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-DENORM-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-DENORM-NEXT: scratch_load_b32 v34, off, s32 offset:12 ; GFX11-DENORM-NEXT: scratch_load_b32 v35, off, s32 offset:16 ; GFX11-DENORM-NEXT: scratch_load_b32 v36, off, s32 offset:20 ; GFX11-DENORM-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-DENORM-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-DENORM-NEXT: scratch_load_b32 v38, off, s32 offset:28 ; GFX11-DENORM-NEXT: scratch_load_b32 v39, off, s32 offset:32 -; GFX11-DENORM-NEXT: s_waitcnt vmcnt(6) +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(7) ; GFX11-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX11-DENORM-NEXT: s_waitcnt vmcnt(4) +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(5) ; GFX11-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX11-DENORM-NEXT: s_waitcnt vmcnt(2) +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(3) ; GFX11-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] @@ -921,20 +921,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_clause 0x8 -; 
GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 ; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 ; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 ; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(6) +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(7) ; GFX10-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(4) +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(5) ; GFX10-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(2) +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(3) ; GFX10-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] @@ -948,20 +948,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x8 -; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 ; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 ; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 ; GFX10-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(6) +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(7) ; GFX10-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(4) +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(5) ; GFX10-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX10-DENORM-NEXT: s_waitcnt vmcnt(2) +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(3) ; GFX10-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] @@ -975,20 +975,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x ; GFX11-CONTRACT: ; %bb.0: ; %.entry ; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-CONTRACT-NEXT: s_clause 0x8 -; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v34, off, s32 offset:12 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v35, off, s32 offset:16 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v36, off, s32 offset:20 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-CONTRACT-NEXT: scratch_load_b32 
v38, off, s32 offset:28 ; GFX11-CONTRACT-NEXT: scratch_load_b32 v39, off, s32 offset:32 -; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(6) +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(7) ; GFX11-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(4) +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(5) ; GFX11-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(2) +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(3) ; GFX11-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] ; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] @@ -1004,20 +1004,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x ; GFX11-DENORM: ; %bb.0: ; %.entry ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x8 -; GFX11-DENORM-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-DENORM-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-DENORM-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-DENORM-NEXT: scratch_load_b32 v34, off, s32 offset:12 ; GFX11-DENORM-NEXT: scratch_load_b32 v35, off, s32 offset:16 ; GFX11-DENORM-NEXT: scratch_load_b32 v36, off, s32 offset:20 ; GFX11-DENORM-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-DENORM-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-DENORM-NEXT: scratch_load_b32 v38, off, s32 offset:28 ; GFX11-DENORM-NEXT: scratch_load_b32 v39, off, s32 offset:32 -; GFX11-DENORM-NEXT: s_waitcnt vmcnt(6) +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(7) ; GFX11-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX11-DENORM-NEXT: s_waitcnt vmcnt(4) +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(5) ; GFX11-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX11-DENORM-NEXT: s_waitcnt vmcnt(2) +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(3) ; GFX11-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll index 4ed1cb2d1260e..745beaaf43330 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -69,6 +69,14 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 ; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:20 ; 4-byte Folded Reload @@ -77,22 +85,21 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(15) ; CHECK-NEXT: v_readfirstlane_b32 s12, v7 +; CHECK-NEXT: s_waitcnt vmcnt(14) ; CHECK-NEXT: v_readfirstlane_b32 s10, v6 +; CHECK-NEXT: s_waitcnt vmcnt(13) ; CHECK-NEXT: v_readfirstlane_b32 s9, v5 +; CHECK-NEXT: s_waitcnt vmcnt(12) ; CHECK-NEXT: v_readfirstlane_b32 s8, v4 +; CHECK-NEXT: s_waitcnt vmcnt(11) ; CHECK-NEXT: v_readfirstlane_b32 s7, v3 +; CHECK-NEXT: s_waitcnt vmcnt(10) ; CHECK-NEXT: v_readfirstlane_b32 s6, v2 +; CHECK-NEXT: s_waitcnt vmcnt(9) ; CHECK-NEXT: v_readfirstlane_b32 s5, v1 +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: v_readfirstlane_b32 s4, v0 ; CHECK-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; CHECK-NEXT: s_mov_b32 s13, s10 @@ -110,12 +117,16 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: v_writelane_b32 v16, s17, 10 ; CHECK-NEXT: v_writelane_b32 v16, s18, 11 ; CHECK-NEXT: v_writelane_b32 v16, s19, 12 +; CHECK-NEXT: s_waitcnt vmcnt(6) ; CHECK-NEXT: v_mov_b32_e32 v6, v8 ; CHECK-NEXT: v_mov_b32_e32 v7, v9 +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: v_mov_b32_e32 v4, v10 ; CHECK-NEXT: v_mov_b32_e32 v5, v11 +; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: v_mov_b32_e32 v2, v12 ; CHECK-NEXT: v_mov_b32_e32 v3, v13 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, v14 ; CHECK-NEXT: v_mov_b32_e32 v1, v15 ; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll index e0016b0a5a64d..21d1b04e1aeee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll @@ -16,41 +16,41 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 ; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4 ; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc -; LOOP-NEXT: buffer_load_ubyte v26, v[6:7], s[0:3], 0 addr64 ; LOOP-NEXT: s_waitcnt expcnt(5) ; LOOP-NEXT: buffer_load_ubyte v29, v[6:7], s[0:3], 0 addr64 offset:1 +; LOOP-NEXT: buffer_load_ubyte v26, v[6:7], s[0:3], 0 addr64 +; LOOP-NEXT: buffer_load_ubyte v32, v[6:7], s[0:3], 0 addr64 offset:3 ; LOOP-NEXT: s_waitcnt expcnt(2) ; LOOP-NEXT: buffer_load_ubyte v31, v[6:7], s[0:3], 0 addr64 offset:2 -; LOOP-NEXT: buffer_load_ubyte v32, v[6:7], s[0:3], 0 addr64 offset:3 -; LOOP-NEXT: buffer_load_ubyte v36, 
v[6:7], s[0:3], 0 addr64 offset:4 ; LOOP-NEXT: buffer_load_ubyte v37, v[6:7], s[0:3], 0 addr64 offset:5 -; LOOP-NEXT: buffer_load_ubyte v38, v[6:7], s[0:3], 0 addr64 offset:6 ; LOOP-NEXT: buffer_load_ubyte v39, v[6:7], s[0:3], 0 addr64 offset:7 -; LOOP-NEXT: buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64 offset:8 +; LOOP-NEXT: buffer_load_ubyte v38, v[6:7], s[0:3], 0 addr64 offset:6 +; LOOP-NEXT: buffer_load_ubyte v36, v[6:7], s[0:3], 0 addr64 offset:4 ; LOOP-NEXT: buffer_load_ubyte v11, v[6:7], s[0:3], 0 addr64 offset:9 -; LOOP-NEXT: buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:10 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: buffer_load_ubyte v13, v[6:7], s[0:3], 0 addr64 offset:11 -; LOOP-NEXT: buffer_load_ubyte v9, v[6:7], s[0:3], 0 addr64 offset:12 +; LOOP-NEXT: buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:10 ; LOOP-NEXT: buffer_load_ubyte v15, v[6:7], s[0:3], 0 addr64 offset:13 -; LOOP-NEXT: buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:14 ; LOOP-NEXT: buffer_load_ubyte v17, v[6:7], s[0:3], 0 addr64 offset:15 -; LOOP-NEXT: buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:14 ; LOOP-NEXT: buffer_load_ubyte v19, v[6:7], s[0:3], 0 addr64 offset:17 -; LOOP-NEXT: buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:18 ; LOOP-NEXT: buffer_load_ubyte v21, v[6:7], s[0:3], 0 addr64 offset:19 -; LOOP-NEXT: buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:20 +; LOOP-NEXT: buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:18 ; LOOP-NEXT: buffer_load_ubyte v23, v[6:7], s[0:3], 0 addr64 offset:21 -; LOOP-NEXT: buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:22 ; LOOP-NEXT: buffer_load_ubyte v25, v[6:7], s[0:3], 0 addr64 offset:23 -; LOOP-NEXT: buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:22 ; LOOP-NEXT: buffer_load_ubyte v27, v[6:7], s[0:3], 0 addr64 offset:25 -; LOOP-NEXT: buffer_load_ubyte v28, v[6:7], s[0:3], 0 addr64 offset:26 ; LOOP-NEXT: buffer_load_ubyte v30, v[6:7], s[0:3], 0 addr64 offset:27 -; LOOP-NEXT: buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: buffer_load_ubyte v28, v[6:7], s[0:3], 0 addr64 offset:26 ; LOOP-NEXT: buffer_load_ubyte v33, v[6:7], s[0:3], 0 addr64 offset:29 -; LOOP-NEXT: buffer_load_ubyte v34, v[6:7], s[0:3], 0 addr64 offset:30 ; LOOP-NEXT: buffer_load_ubyte v35, v[6:7], s[0:3], 0 addr64 offset:31 +; LOOP-NEXT: buffer_load_ubyte v34, v[6:7], s[0:3], 0 addr64 offset:30 +; LOOP-NEXT: buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64 offset:8 +; LOOP-NEXT: buffer_load_ubyte v9, v[6:7], s[0:3], 0 addr64 offset:12 +; LOOP-NEXT: buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:20 +; LOOP-NEXT: buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:28 ; LOOP-NEXT: s_waitcnt vmcnt(14) ; LOOP-NEXT: v_lshlrev_b32_e32 v6, 8, v29 ; LOOP-NEXT: v_or_b32_e32 v26, v6, v26 @@ -74,34 +74,41 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: v_lshlrev_b32_e32 v17, 24, v17 ; LOOP-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; LOOP-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; LOOP-NEXT: s_waitcnt vmcnt(12) ; LOOP-NEXT: v_lshlrev_b32_e32 v21, 24, v21 ; LOOP-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; LOOP-NEXT: s_waitcnt vmcnt(10) ; LOOP-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; LOOP-NEXT: 
s_waitcnt vmcnt(8) +; LOOP-NEXT: s_waitcnt vmcnt(13) ; LOOP-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; LOOP-NEXT: s_waitcnt vmcnt(12) ; LOOP-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; LOOP-NEXT: s_waitcnt vmcnt(6) +; LOOP-NEXT: s_waitcnt vmcnt(11) ; LOOP-NEXT: v_lshlrev_b32_e32 v27, 8, v27 -; LOOP-NEXT: s_waitcnt vmcnt(4) +; LOOP-NEXT: s_waitcnt vmcnt(10) ; LOOP-NEXT: v_lshlrev_b32_e32 v30, 24, v30 +; LOOP-NEXT: s_waitcnt vmcnt(9) ; LOOP-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; LOOP-NEXT: s_waitcnt vmcnt(2) +; LOOP-NEXT: s_waitcnt vmcnt(8) ; LOOP-NEXT: v_lshlrev_b32_e32 v33, 8, v33 -; LOOP-NEXT: s_waitcnt vmcnt(0) +; LOOP-NEXT: s_waitcnt vmcnt(7) ; LOOP-NEXT: v_lshlrev_b32_e32 v35, 24, v35 +; LOOP-NEXT: s_waitcnt vmcnt(6) ; LOOP-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; LOOP-NEXT: s_waitcnt vmcnt(5) ; LOOP-NEXT: v_or_b32_e32 v8, v11, v8 ; LOOP-NEXT: v_or_b32_e32 v11, v13, v12 +; LOOP-NEXT: s_waitcnt vmcnt(4) ; LOOP-NEXT: v_or_b32_e32 v9, v15, v9 ; LOOP-NEXT: v_or_b32_e32 v12, v17, v16 +; LOOP-NEXT: s_waitcnt vmcnt(3) ; LOOP-NEXT: v_or_b32_e32 v10, v19, v10 ; LOOP-NEXT: v_or_b32_e32 v13, v21, v20 +; LOOP-NEXT: s_waitcnt vmcnt(2) ; LOOP-NEXT: v_or_b32_e32 v14, v23, v14 ; LOOP-NEXT: v_or_b32_e32 v15, v25, v24 +; LOOP-NEXT: s_waitcnt vmcnt(1) ; LOOP-NEXT: v_or_b32_e32 v16, v27, v18 ; LOOP-NEXT: v_or_b32_e32 v17, v30, v28 +; LOOP-NEXT: s_waitcnt vmcnt(0) ; LOOP-NEXT: v_or_b32_e32 v18, v33, v22 ; LOOP-NEXT: v_or_b32_e32 v19, v35, v34 ; LOOP-NEXT: v_or_b32_e32 v20, v29, v26 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index efa51ead0d196..232b738d1ad71 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -32,12 +32,12 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX12-NOUNALIGNED-NEXT: s_clause 0xb ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v2, v[0:1], off ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v3, v[0:1], off offset:1 -; GFX12-NOUNALIGNED-NEXT: global_load_u8 v4, v[0:1], off offset:2 ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v5, v[0:1], off offset:3 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v4, v[0:1], off offset:2 ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v6, v[0:1], off offset:4 ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v7, v[0:1], off offset:5 -; GFX12-NOUNALIGNED-NEXT: global_load_u8 v8, v[0:1], off offset:6 ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v9, v[0:1], off offset:7 +; GFX12-NOUNALIGNED-NEXT: global_load_u8 v8, v[0:1], off offset:6 ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8 ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9 ; GFX12-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11 @@ -45,15 +45,15 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x9 -; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8 +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6 ; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x5 -; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4 ; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 
24, v9 +; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4 +; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 ; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2 ; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10 ; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1 @@ -81,12 +81,12 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v2, v[0:1], off ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v3, v[0:1], off offset:1 -; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v4, v[0:1], off offset:2 ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v5, v[0:1], off offset:3 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v4, v[0:1], off offset:2 ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v6, v[0:1], off offset:4 ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v7, v[0:1], off offset:5 -; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v8, v[0:1], off offset:6 ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v9, v[0:1], off offset:7 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v8, v[0:1], off offset:6 ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8 ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9 ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11 @@ -95,11 +95,11 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0 ; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 8, v2 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8 -; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v2, 16, v4 :: v_dual_lshlrev_b32 v1, 24, v5 +; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v1, 24, v5 :: v_dual_lshlrev_b32 v2, 16, v4 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6 ; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4 -; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v5, 16, v8 :: v_dual_lshlrev_b32 v4, 24, v9 +; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v4, 24, v9 :: v_dual_lshlrev_b32 v5, 16, v8 ; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2 ; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10 @@ -122,12 +122,12 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[0:1], off offset:1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[0:1], off offset:2 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[0:1], off offset:3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[0:1], off offset:2 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[0:1], off offset:4 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[0:1], off offset:5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[0:1], off offset:6 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[0:1], off offset:7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[0:1], off offset:6 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[0:1], off offset:8 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[0:1], off offset:9 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off offset:11 @@ -135,16 +135,16 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) ; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 8, v2 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt 
vmcnt(8) ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 24, v5 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) ; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v8 ; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) ; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll index d7fcbd5d623c9..8718777e8d067 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll @@ -46,36 +46,39 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_u8 v1, v0 ; GFX9-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX9-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX9-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX9-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX9-NEXT: ds_read_u8 v5, v0 offset:4 ; GFX9-NEXT: ds_read_u8 v6, v0 offset:5 -; GFX9-NEXT: ds_read_u8 v7, v0 offset:6 ; GFX9-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX9-NEXT: ds_read_u8 v7, v0 offset:6 ; GFX9-NEXT: s_waitcnt lgkmcnt(6) ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 8, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: s_waitcnt lgkmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_or3_b32 v4, v2, v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; GFX9-NEXT: v_or3_b32 v1, v2, v3, v1 ; GFX9-NEXT: ds_read_u8 v2, v0 offset:8 ; GFX9-NEXT: ds_read_u8 v3, v0 offset:9 -; GFX9-NEXT: ds_read_u8 v5, v0 offset:10 ; GFX9-NEXT: ds_read_u8 v6, v0 offset:11 +; GFX9-NEXT: ds_read_u8 v5, v0 offset:10 ; GFX9-NEXT: ds_read_u8 v7, v0 offset:12 ; GFX9-NEXT: ds_read_u8 v8, v0 offset:13 ; GFX9-NEXT: ds_read_u8 v9, v0 offset:14 ; GFX9-NEXT: ds_read_u8 v0, v0 offset:15 ; GFX9-NEXT: s_waitcnt lgkmcnt(6) ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 8, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: s_waitcnt lgkmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX9-NEXT: s_waitcnt lgkmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX9-NEXT: v_or3_b32 v2, v3, v5, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) @@ -91,47 +94,52 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_u8 v1, v0 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v1, v0 ; GFX7-NEXT: ds_read_u8 v4, v0 offset:3 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:5 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 -; GFX7-NEXT: s_waitcnt lgkmcnt(6) +; GFX7-NEXT: 
ds_read_u8 v7, v0 offset:6 +; GFX7-NEXT: s_waitcnt lgkmcnt(7) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(6) ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: ds_read_u8 v2, v0 offset:8 ; GFX7-NEXT: ds_read_u8 v3, v0 offset:9 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:8 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:11 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:12 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:13 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:12 ; GFX7-NEXT: ds_read_u8 v9, v0 offset:14 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:15 -; GFX7-NEXT: s_waitcnt lgkmcnt(6) +; GFX7-NEXT: s_waitcnt lgkmcnt(7) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(6) ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 @@ -147,16 +155,16 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_u8 v1, v0 ; GFX10-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX10-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX10-NEXT: ds_read_u8 v5, v0 offset:4 ; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 -; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 ; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 ; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 ; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 ; GFX10-NEXT: ds_read_u8 v12, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 ; GFX10-NEXT: ds_read_u8 v13, v0 offset:12 ; GFX10-NEXT: ds_read_u8 v14, v0 offset:13 ; GFX10-NEXT: ds_read_u8 v15, v0 offset:15 @@ -164,21 +172,21 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) { ; GFX10-NEXT: s_waitcnt lgkmcnt(14) ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(13) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(12) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX10-NEXT: s_waitcnt lgkmcnt(12) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(10) ; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX10-NEXT: s_waitcnt lgkmcnt(8) ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX10-NEXT: s_waitcnt lgkmcnt(8) +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(6) ; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; GFX10-NEXT: s_waitcnt lgkmcnt(4) ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v12 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v11 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) ; GFX10-NEXT: v_lshl_or_b32 v10, v14, 8, v13 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) @@ -196,16 +204,16 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_u8 v1, v0 ; GFX11-NEXT: ds_load_u8 v2, v0 offset:1 -; GFX11-NEXT: ds_load_u8 v3, v0 offset:2 ; GFX11-NEXT: ds_load_u8 v4, v0 offset:3 +; GFX11-NEXT: ds_load_u8 v3, v0 offset:2 ; GFX11-NEXT: ds_load_u8 v5, v0 offset:4 ; GFX11-NEXT: ds_load_u8 v6, v0 offset:5 -; GFX11-NEXT: ds_load_u8 v7, v0 offset:6 ; GFX11-NEXT: ds_load_u8 v8, v0 offset:7 +; GFX11-NEXT: ds_load_u8 v7, v0 offset:6 ; GFX11-NEXT: ds_load_u8 v9, v0 offset:8 ; GFX11-NEXT: ds_load_u8 v10, v0 offset:9 -; GFX11-NEXT: ds_load_u8 v11, v0 offset:10 ; GFX11-NEXT: ds_load_u8 v12, v0 offset:11 +; GFX11-NEXT: ds_load_u8 v11, v0 offset:10 ; GFX11-NEXT: ds_load_u8 v13, v0 offset:12 ; GFX11-NEXT: ds_load_u8 v14, v0 offset:13 ; GFX11-NEXT: ds_load_u8 v15, v0 offset:15 @@ -213,21 +221,21 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) { ; GFX11-NEXT: s_waitcnt lgkmcnt(14) ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(13) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(12) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX11-NEXT: s_waitcnt lgkmcnt(12) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(10) ; GFX11-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX11-NEXT: s_waitcnt lgkmcnt(9) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX11-NEXT: s_waitcnt lgkmcnt(8) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX11-NEXT: s_waitcnt lgkmcnt(8) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX11-NEXT: s_waitcnt lgkmcnt(6) ; GFX11-NEXT: v_lshl_or_b32 v7, v10, 8, v9 ; GFX11-NEXT: s_waitcnt lgkmcnt(5) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; GFX11-NEXT: s_waitcnt lgkmcnt(4) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 24, v12 +; GFX11-NEXT: s_waitcnt lgkmcnt(4) +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v11 ; GFX11-NEXT: s_waitcnt lgkmcnt(2) ; GFX11-NEXT: v_lshl_or_b32 v10, v14, 8, v13 ; GFX11-NEXT: s_waitcnt lgkmcnt(1) @@ -270,25 +278,28 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_u16 v1, v0 ; GFX7-NEXT: ds_read_u16 v2, v0 offset:2 -; GFX7-NEXT: ds_read_u16 v3, v0 offset:4 +; GFX7-NEXT: ds_read_u16 v1, v0 ; GFX7-NEXT: ds_read_u16 v4, v0 offset:6 -; GFX7-NEXT: ds_read_u16 v5, v0 offset:8 +; GFX7-NEXT: ds_read_u16 v3, v0 offset:4 ; GFX7-NEXT: ds_read_u16 v6, v0 offset:10 -; GFX7-NEXT: ds_read_u16 v7, v0 offset:12 +; GFX7-NEXT: ds_read_u16 v5, v0 offset:8 ; GFX7-NEXT: ds_read_u16 v8, v0 offset:14 -; GFX7-NEXT: s_waitcnt lgkmcnt(6) +; GFX7-NEXT: ds_read_u16 v7, v0 offset:12 +; GFX7-NEXT: s_waitcnt lgkmcnt(7) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(6) ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) 
+; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll index 191f2e0670e15..2a8334ab31b7b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -46,16 +46,17 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_u8 v1, v0 ; GFX9-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX9-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX9-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX9-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX9-NEXT: ds_read_u8 v5, v0 offset:4 ; GFX9-NEXT: ds_read_u8 v6, v0 offset:5 -; GFX9-NEXT: ds_read_u8 v7, v0 offset:6 ; GFX9-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX9-NEXT: ds_read_u8 v7, v0 offset:6 ; GFX9-NEXT: s_waitcnt lgkmcnt(6) ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 8, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: s_waitcnt lgkmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_or3_b32 v3, v2, v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) @@ -64,8 +65,9 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) { ; GFX9-NEXT: ds_read_u8 v4, v0 offset:9 ; GFX9-NEXT: ds_read_u8 v5, v0 offset:10 ; GFX9-NEXT: ds_read_u8 v0, v0 offset:11 -; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: s_waitcnt lgkmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; GFX9-NEXT: s_waitcnt lgkmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 8, v2 @@ -82,36 +84,38 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_u8 v1, v0 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v1, v0 ; GFX7-NEXT: ds_read_u8 v4, v0 offset:3 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:5 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 -; GFX7-NEXT: s_waitcnt lgkmcnt(6) +; GFX7-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX7-NEXT: s_waitcnt lgkmcnt(7) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(6) ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:8 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:9 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:8 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:11 
-; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 @@ -127,12 +131,12 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_u8 v1, v0 ; GFX10-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX10-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX10-NEXT: ds_read_u8 v5, v0 offset:4 ; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 -; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 ; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 ; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 ; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 ; GFX10-NEXT: ds_read_u8 v11, v0 offset:11 @@ -140,15 +144,15 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) { ; GFX10-NEXT: s_waitcnt lgkmcnt(10) ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(8) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX10-NEXT: s_waitcnt lgkmcnt(8) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(6) ; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX10-NEXT: s_waitcnt lgkmcnt(4) ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) ; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) @@ -165,12 +169,12 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_u8 v1, v0 ; GFX11-NEXT: ds_load_u8 v2, v0 offset:1 -; GFX11-NEXT: ds_load_u8 v3, v0 offset:2 ; GFX11-NEXT: ds_load_u8 v4, v0 offset:3 +; GFX11-NEXT: ds_load_u8 v3, v0 offset:2 ; GFX11-NEXT: ds_load_u8 v5, v0 offset:4 ; GFX11-NEXT: ds_load_u8 v6, v0 offset:5 -; GFX11-NEXT: ds_load_u8 v7, v0 offset:6 ; GFX11-NEXT: ds_load_u8 v8, v0 offset:7 +; GFX11-NEXT: ds_load_u8 v7, v0 offset:6 ; GFX11-NEXT: ds_load_u8 v9, v0 offset:8 ; GFX11-NEXT: ds_load_u8 v10, v0 offset:9 ; GFX11-NEXT: ds_load_u8 v11, v0 offset:11 @@ -178,15 +182,15 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) { ; GFX11-NEXT: s_waitcnt lgkmcnt(10) ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(9) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(8) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX11-NEXT: s_waitcnt lgkmcnt(8) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(6) ; GFX11-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX11-NEXT: s_waitcnt lgkmcnt(5) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX11-NEXT: s_waitcnt lgkmcnt(4) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX11-NEXT: s_waitcnt lgkmcnt(4) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX11-NEXT: s_waitcnt lgkmcnt(2) ; GFX11-NEXT: v_lshl_or_b32 v7, v10, 8, v9 ; GFX11-NEXT: s_waitcnt lgkmcnt(1) @@ -224,20 +228,22 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: 
ds_read_u16 v1, v0 ; GFX7-NEXT: ds_read_u16 v2, v0 offset:2 -; GFX7-NEXT: ds_read_u16 v3, v0 offset:4 +; GFX7-NEXT: ds_read_u16 v1, v0 ; GFX7-NEXT: ds_read_u16 v4, v0 offset:6 -; GFX7-NEXT: ds_read_u16 v5, v0 offset:8 +; GFX7-NEXT: ds_read_u16 v3, v0 offset:4 ; GFX7-NEXT: ds_read_u16 v6, v0 offset:10 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: ds_read_u16 v5, v0 offset:8 +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll index b1de0eff05d30..d319ae066aae2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -22,47 +22,52 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_u8 v1, v0 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v1, v0 ; GFX7-NEXT: ds_read_u8 v4, v0 offset:3 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:5 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 -; GFX7-NEXT: s_waitcnt lgkmcnt(6) +; GFX7-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX7-NEXT: s_waitcnt lgkmcnt(7) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(6) ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: ds_read_u8 v2, v0 offset:8 ; GFX7-NEXT: ds_read_u8 v3, v0 offset:9 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:8 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:11 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:12 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:13 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:12 ; GFX7-NEXT: ds_read_u8 v9, v0 offset:14 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:15 -; GFX7-NEXT: s_waitcnt lgkmcnt(6) +; GFX7-NEXT: s_waitcnt lgkmcnt(7) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(6) ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_or_b32_e32 
v3, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 @@ -104,36 +109,38 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_u8 v1, v0 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v1, v0 ; GFX7-NEXT: ds_read_u8 v4, v0 offset:3 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:5 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 -; GFX7-NEXT: s_waitcnt lgkmcnt(6) +; GFX7-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX7-NEXT: s_waitcnt lgkmcnt(7) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(6) ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:8 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:9 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:8 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:11 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 74552a500ac51..4078058ea2196 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -3121,8 +3121,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 @@ -3284,7 +3284,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3522,6 +3522,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 
%b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 @@ -4333,8 +4334,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -5302,8 +5303,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -5493,7 +5494,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(30) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -5502,6 +5503,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 @@ -5697,6 +5699,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 ; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 @@ -11806,28 +11809,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 @@ -11837,28 +11841,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 @@ -11868,28 +11873,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt 
vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 @@ -11899,28 +11905,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 @@ -11930,28 +11937,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 @@ -11961,26 +11969,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 
; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 @@ -11990,12 +12000,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 @@ -13322,25 +13332,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: 
s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -13348,25 +13360,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -13374,25 +13388,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 
; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -13400,25 +13416,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -13426,25 +13444,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -13452,25 +13472,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -13478,11 +13500,9 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 @@ -13498,6 +13518,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v59, off, 
s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -14555,26 +14577,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -14582,26 +14606,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -14609,26 +14635,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -14636,26 +14664,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 
offset:304 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -14663,26 +14693,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -14690,26 +14722,28 @@ define <32 x i32> 
@bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -14717,11 +14751,10 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 @@ -15639,50 +15672,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 ; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 @@ -15704,7 +15693,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176 @@ -15712,6 +15700,51 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 
offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 @@ -15772,21 +15805,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v51.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h @@ -15821,12 +15839,24 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -16424,50 +16454,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 ; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 @@ -16489,7 +16475,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 ; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 ; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 @@ -16497,6 +16482,51 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 ; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 ; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 
offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 ; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 ; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 ; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 @@ -16540,61 +16570,34 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 -; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 @@ -16604,19 +16607,33 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -17606,30 +17623,29 @@ define 
inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 @@ -17643,14 +17659,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v56, 
off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 @@ -18840,11 +18856,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 ; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) @@ -18859,44 +18875,44 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; 
VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -19848,11 +19864,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(10) @@ -19864,32 +19880,33 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -19900,10 +19917,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte 
Folded Spill -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -22598,9 +22615,9 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -22633,7 +22650,7 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -22752,6 +22769,7 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -22856,6 +22874,7 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v62 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 @@ -31052,9 +31071,9 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -31087,7 +31106,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -31160,6 +31179,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; 
%bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -35016,8 +35036,8 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -35051,7 +35071,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -35102,6 +35122,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB24_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 @@ -36348,12 +36369,12 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 @@ -37061,24 +37082,27 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 
offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -37098,31 +37122,32 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -37178,6 +37203,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 @@ -37248,6 +37274,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 
v35, v34 ; SI-NEXT: v_mov_b32_e32 v34, v54 ; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v53, v63 ; SI-NEXT: v_mov_b32_e32 v62, v52 @@ -40049,8 +40076,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 @@ -40212,7 +40239,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -40450,6 +40477,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB36_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 @@ -41261,8 +41289,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -42230,8 +42258,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -42421,7 +42449,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(30) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -42430,6 +42458,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; 
GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 @@ -42625,6 +42654,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB36_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 ; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -49870,28 +49900,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 @@ -49901,28 +49932,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; 
SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 @@ -49932,28 +49964,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 @@ -49963,28 +49996,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 @@ -49994,28 +50028,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 @@ -50025,26 +50060,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 @@ -50054,12 +50091,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 @@ -51386,25 +51423,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 
offset:188 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -51412,25 +51451,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -51438,25 +51479,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; 
VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -51464,25 +51507,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -51490,25 +51535,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -51516,25 +51563,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -51542,11 +51591,9 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 @@ -51562,6 +51609,8 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -52619,26 +52668,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 
v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -52646,26 +52697,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -52673,26 +52726,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -52700,26 +52755,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -52727,26 +52784,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], 
s32 offset:316 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -52754,26 +52813,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -52781,11 +52842,10 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 @@ -53703,50 +53763,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 ; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 @@ -53768,7 +53784,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176 @@ -53776,6 +53791,51 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v36, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 @@ -53836,21 +53896,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h @@ -53885,12 +53930,24 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -54488,50 +54545,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 ; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 @@ -54553,7 +54566,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, 
i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 ; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 ; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 @@ -54561,6 +54573,51 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 ; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 ; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 ; 
GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 ; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 ; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 @@ -54604,61 +54661,34 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 @@ -54668,19 +54698,33 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(59) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -55670,30 +55714,29 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 @@ -55707,14 +55750,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 @@ -56904,11 +56947,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 ; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) @@ -56923,44 +56966,44 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 -; VI-NEXT: 
buffer_load_ushort v17, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v49, off, 
s[0:3], s32 offset:300 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -57912,11 +57955,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(10) @@ -57928,32 +57971,33 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], 
s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -57964,10 +58008,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -60662,9 +60706,9 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -60697,7 +60741,7 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -60816,6 +60860,7 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -60920,6 +60965,7 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; 
SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v62 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v31, 1.0, v63 @@ -69162,9 +69208,9 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -69197,7 +69243,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -69270,6 +69316,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -73097,8 +73144,8 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -73132,7 +73179,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -73183,6 +73230,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 @@ -74383,12 +74431,12 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 @@ -75096,24 +75144,27 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -75133,31 +75184,32 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -75213,6 +75265,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 @@ -75283,6 +75336,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v35, v34 ; SI-NEXT: v_mov_b32_e32 v34, v54 ; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v53, v63 ; SI-NEXT: v_mov_b32_e32 v62, v52 @@ -77070,8 +77124,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 @@ -77233,7 +77287,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -78282,8 +78336,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -79251,8 +79305,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded 
Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -79442,7 +79496,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(30) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -79451,6 +79505,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 @@ -85773,28 +85828,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 @@ -85804,28 +85860,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], 
s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 @@ -85835,28 +85892,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt 
vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 @@ -85866,28 +85924,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 @@ -85897,28 +85956,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt 
vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 @@ -85928,26 +85988,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 @@ -85957,12 +86019,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 @@ -87289,25 +87351,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -87315,25 +87379,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, 
s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -87341,25 +87407,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -87367,25 +87435,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -87393,25 +87463,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -87419,25 +87491,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -87445,11 +87519,9 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 @@ -87465,6 +87537,8 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -88522,26 +88596,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -88549,26 +88625,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -88576,26 +88654,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: 
buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -88603,26 +88683,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -88630,26 
+88712,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -88657,26 +88741,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -88684,11 +88770,10 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 @@ -89606,50 +89691,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 ; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 @@ -89671,7 +89712,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176 @@ -89679,6 +89719,51 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 @@ -89739,21 +89824,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h @@ -89788,12 +89858,24 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 
8, v67.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -90391,50 +90473,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 ; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 @@ -90456,7 +90494,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 ; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 ; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 @@ -90464,6 +90501,51 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 ; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 ; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, 
off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 ; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 ; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 ; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 @@ -90507,61 +90589,34 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 @@ -90571,19 +90626,33 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -91573,30 +91642,29 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword 
v28, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 @@ -91610,14 +91678,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 @@ -92807,11 +92875,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 ; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) @@ -92826,44 +92894,44 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> 
inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 -; VI-NEXT: 
buffer_load_ushort v52, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -93815,11 +93883,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(10) @@ -93831,32 +93899,33 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: 
buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -93867,10 +93936,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -96565,9 +96634,9 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -96600,7 +96669,7 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; 
SI-NEXT: ; kill: killed $vgpr31 @@ -96719,6 +96788,7 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -105007,9 +105077,9 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -105042,7 +105112,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -105115,6 +105185,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -108984,8 +109055,8 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -109019,7 +109090,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -110330,12 +110401,12 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 @@ -111043,24 +111114,27 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -111080,31 +111154,32 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill 
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -111160,6 +111235,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 @@ -111230,6 +111306,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v35, v34 ; SI-NEXT: v_mov_b32_e32 v34, v54 ; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v53, v63 ; SI-NEXT: v_mov_b32_e32 v62, v52 @@ -112048,8 +112125,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 @@ -112211,7 +112288,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -113244,8 +113321,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -114200,8 +114277,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, 
s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; kill: killed $vgpr41 @@ -114395,13 +114472,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(32) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 @@ -121812,28 +121890,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 @@ -121843,28 +121922,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], 
s32 offset:220 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 @@ -121874,28 +121954,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 @@ -121905,28 +121986,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 @@ -121936,28 +122018,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 @@ -121967,26 +122050,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 @@ -121996,12 +122081,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 @@ -123328,25 +123413,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -123354,25 +123441,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -123380,25 +123469,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -123406,25 +123497,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte 
Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -123432,25 +123525,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -123458,25 +123553,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -123484,11 +123581,9 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 @@ -123504,6 +123599,8 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -124561,26 +124658,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -124588,26 +124687,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -124615,26 +124716,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, 
s[0:3], s32 offset:272 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -124642,26 +124745,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -124669,26 +124774,28 @@ define <16 x 
double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -124696,26 +124803,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -124723,11 +124832,10 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 @@ -125645,50 +125753,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 ; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 @@ -125710,7 +125774,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176 @@ -125718,6 +125781,51 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 @@ -125778,21 +125886,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h @@ -125827,12 +125920,24 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v67.l, 8, v67.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -126430,50 +126535,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, 
off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 ; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 @@ -126495,7 +126556,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 ; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 ; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 @@ -126503,6 +126563,51 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 ; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 ; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 ; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 ; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 ; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 @@ -126546,61 +126651,34 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 @@ -126610,19 +126688,33 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -127612,30 +127704,29 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, 
off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 @@ -127649,14 +127740,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 @@ -128846,11 +128937,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 -; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 ; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) @@ -128865,44 +128956,44 @@ define 
inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 ; VI-NEXT: 
buffer_load_ushort v56, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -129854,11 +129945,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(10) @@ -129870,32 +129961,33 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 ; GFX9-NEXT: 
buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -129906,10 +129998,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -132604,8 +132696,8 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 @@ -132703,13 +132795,14 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -140994,8 +141087,8 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 @@ -141093,7 +141186,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -141143,6 +141236,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 @@ -144851,8 +144945,8 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -144886,7 +144980,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -146041,12 +146135,12 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 @@ -146754,24 +146848,27 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -146791,31 +146888,32 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill 
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -146871,6 +146969,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 @@ -146941,6 +147040,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v35, v34 ; SI-NEXT: v_mov_b32_e32 v34, v54 ; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v53, v63 ; SI-NEXT: v_mov_b32_e32 v62, v52 @@ -147778,6 +147878,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:392 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 @@ -147798,7 +147899,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -147847,7 +147947,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -147874,10 +147976,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:604 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -147895,6 +147994,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -147904,7 +148004,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 @@ -147954,24 +148054,24 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 @@ -147981,22 +148081,23 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], 
s32 offset:224 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 @@ -148006,21 +148107,21 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 @@ -148096,6 +148197,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 @@ -148106,8 +148208,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:344 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 @@ -150032,25 +150133,27 @@ 
define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150058,25 +150161,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; 
VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150084,25 +150189,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150110,25 +150217,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150136,25 +150245,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150162,23 +150273,23 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: 
v_lshlrev_b16_e32 v51, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150186,11 +150297,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 @@ -150206,6 +150315,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -151274,26 +151385,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151301,26 +151414,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151328,26 +151443,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151355,26 +151472,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151382,26 +151501,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 
offset:316 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151409,24 +151530,23 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151434,11 +151554,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: buffer_load_ushort v0, 
off, s[0:3], s32 offset:384 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 @@ -152361,50 +152480,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v133, off, s32 offset:216 ; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8 @@ -152426,7 +152501,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:176 @@ -152434,6 +152508,51 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 
offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:196 @@ -152495,26 +152614,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h @@ -152543,6 +152642,26 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v149.h, 8, v149.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -153137,50 +153256,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 ; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 
offset:388 ; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 @@ -153202,7 +153277,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 ; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 ; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 @@ -153210,6 +153284,51 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 ; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 ; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 
offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 ; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 ; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 ; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 @@ -153254,89 +153373,71 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(35) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -153636,10 +153737,13 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB88_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -153656,10 +153760,12 @@ define <64 x 
bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 @@ -153674,10 +153780,12 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 @@ -153692,14 +153800,17 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 @@ -155959,13 +156070,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -155998,19 +156109,19 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 ; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 ; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 ; VI-NEXT: 
buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 ; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 ; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -156987,11 +157098,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 @@ -156999,100 +157110,108 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, 
s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte 
Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(32) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(36) ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill 
+; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill @@ -157380,6 +157499,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB89_3 ; GFX9-NEXT: .LBB89_2: +; GFX9-NEXT: s_waitcnt vmcnt(55) ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -161522,8 +161642,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr45 @@ -163074,8 +163194,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -163273,15 +163393,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB90_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 @@ -163310,20 +163430,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 ; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 @@ -163364,6 +163485,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 @@ -163722,6 +163844,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: v_perm_b32 v33, v17, v24, s7 +; GFX9-NEXT: s_waitcnt vmcnt(53) ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v63 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 @@ -163755,8 +163878,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v35, v17, v23, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -163789,8 +163912,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v37, v1, v22, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 @@ -163934,7 +164057,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v31, v32, vcc ; GFX9-NEXT: v_perm_b32 v54, v11, v1, s7 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v14 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; GFX9-NEXT: v_bfe_u32 v31, v11, 16, 1 @@ -163949,6 +164072,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; 
GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v13 ; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 @@ -163965,7 +164089,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc ; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 @@ -163980,6 +164104,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v32, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -164261,8 +164386,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v63, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 @@ -164278,10 +164403,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v59 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v58 @@ -175501,13 +175626,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -175526,177 +175651,170 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; 
SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; 
SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:328 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:328 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:372 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: 
s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 @@ -175718,11 +175836,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 @@ -177749,25 +177867,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177775,25 +177895,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177801,25 +177923,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded 
Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177827,25 +177951,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177853,25 +177979,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177879,23 +178007,23 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177903,11 +178031,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 @@ -177923,6 +178049,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 ; VI-NEXT: 
buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -178991,26 +179119,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179018,26 +179148,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; 
GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179045,26 +179177,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179072,26 +179206,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 
offset:312 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179099,26 +179235,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179126,24 +179264,23 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: 
buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179151,11 +179288,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 @@ -180078,50 +180214,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 
offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216 ; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8 @@ -180143,7 +180235,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:176 @@ -180151,6 +180242,51 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v135, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:196 @@ -180212,26 +180348,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h @@ -180260,6 +180376,26 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -180854,50 +180990,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v129, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 ; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 ; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 @@ -180919,7 +181011,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 ; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 ; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 @@ -180927,6 +181018,51 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 ; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 ; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 ; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 ; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 ; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 @@ -180971,89 +181107,71 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v108, 8, v26 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -181353,10 +181471,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB92_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -181373,10 +181494,12 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 @@ -181391,10 +181514,12 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 @@ -181409,14 +181534,17 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 ; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 @@ -183580,13 +183708,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -183619,19 +183747,19 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 ; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 ; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 ; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 ; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -184608,11 +184736,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], 
s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 @@ -184620,100 +184748,108 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v53, off, 
s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(32) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(36) ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill @@ -185001,6 +185137,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB93_3 ; GFX9-NEXT: .LBB93_2: +; GFX9-NEXT: s_waitcnt vmcnt(55) ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -189290,8 +189427,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v56, v38 ; VI-NEXT: v_mov_b32_e32 v45, v7 ; VI-NEXT: v_mov_b32_e32 v63, v53 @@ -189304,19 +189441,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44 ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32 ; 
VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v44 @@ -189326,76 +189465,75 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v62, v36 ; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: 
v_lshrrev_b32_e32 v57, 24, v53 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 ; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 
offset:372 ; 4-byte Folded Spill @@ -189425,6 +189563,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53] ; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59] @@ -189444,10 +189584,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v45, v60 ; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50 ; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50] ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40] @@ -189498,33 +189637,35 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v62, v55, v0 ; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 ; VI-NEXT: v_or_b32_e32 v61, v54, v0 ; VI-NEXT: v_mov_b32_e32 v26, v54 ; VI-NEXT: v_mov_b32_e32 v27, v55 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 ; VI-NEXT: v_or_b32_e32 v34, v25, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v33, v24, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 ; VI-NEXT: v_or_b32_e32 v36, v2, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill @@ -189532,41 +189673,43 @@ define <128 x i8> 
@bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v35, v1, v0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 ; VI-NEXT: v_or_b32_e32 v38, v2, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v37, v1, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v49, v9, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -189581,14 +189724,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v51, v3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_or_b32_e32 v50, v2, v0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; VI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_f16_sdwa v3, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v3, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill @@ -189599,28 +189742,28 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 ; VI-NEXT: v_or_b32_e32 v52, v1, v0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_f16_sdwa v59, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v59, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v46, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; VI-NEXT: v_or_b32_e32 v45, v1, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill @@ -189629,45 +189772,46 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_or_b32_e32 v4, v6, v0 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 ; VI-NEXT: v_or_b32_e32 v41, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v40, v6, v0 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 ; VI-NEXT: v_or_b32_e32 v7, v25, v0 ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill @@ -189679,7 +189823,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v31, v43, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 ; VI-NEXT: v_or_b32_e32 v30, v2, v0 -; VI-NEXT: s_waitcnt vmcnt(5) ; 
VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 ; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -190223,8 +190366,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 @@ -190349,7 +190492,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -190468,6 +190611,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 @@ -190606,6 +190750,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] @@ -197789,6 +197934,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_mov_b32_e32 v57, v5 ; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) @@ -197808,7 +197954,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -197860,6 +198005,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; kill: killed $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword 
v35, off, s[0:3], s32 offset:96 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 ; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 @@ -197884,11 +198032,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 @@ -197916,7 +198061,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 @@ -197950,8 +198095,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 @@ -197963,48 +198108,48 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 @@ -198012,24 +198157,24 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 @@ -198037,24 +198182,24 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 @@ -198063,101 +198208,99 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: 
s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 @@ -200101,25 +200244,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200127,25 +200272,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; 
VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200153,25 +200300,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200179,25 +200328,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200205,25 +200356,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200231,23 +200384,23 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, 
s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200255,11 +200408,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 @@ -200275,6 +200426,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -201343,26 +201496,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; 
GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201370,26 +201525,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201397,26 +201554,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; 
GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201424,26 +201583,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201451,26 +201612,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201478,24 +201641,23 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201503,11 +201665,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: 
buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 @@ -202430,50 +202591,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, 
s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216 ; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8 @@ -202495,7 +202612,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:176 @@ -202503,6 +202619,51 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:196 @@ -202564,26 +202725,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h @@ -202612,6 +202753,26 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v146.l, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -203206,50 +203367,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, 
s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 ; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 ; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 @@ -203271,7 +203388,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 ; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 ; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 @@ -203279,6 +203395,51 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 ; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 ; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v83, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 ; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 ; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 ; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 @@ -203323,89 +203484,71 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 -; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -203705,10 +203848,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 ; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v1, v118, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -203725,10 +203871,12 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 @@ -203743,10 +203891,12 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 @@ -203761,14 +203911,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 @@ -206003,13 +206156,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206042,19 +206195,19 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v34, 
off, s[0:3], s32 offset:204 ; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 ; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 ; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 ; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -207031,11 +207184,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 @@ -207043,100 +207196,108 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v19, 
off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(32) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(36) ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: s_waitcnt 
vmcnt(39) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill @@ -207424,6 +207585,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB97_3 ; GFX9-NEXT: .LBB97_2: +; GFX9-NEXT: s_waitcnt vmcnt(55) ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -209549,6 +209711,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 @@ -209560,7 +209723,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 @@ -209593,25 +209755,25 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(1) ; 
SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 @@ -209621,29 +209783,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 @@ -209789,10 +209951,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209811,11 +209973,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209886,26 +210048,26 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; 
SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -212806,8 +212968,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 @@ -212932,7 +213094,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -213051,6 +213213,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 @@ -213188,6 +213351,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 
op_sel_hi:[1,0] @@ -220152,8 +220316,8 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -220223,8 +220387,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v45, 1.0, v45 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v47, 1.0, v47 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 @@ -227723,8 +227888,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -227907,8 +228072,9 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -230712,8 +230878,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill @@ -230807,7 +230973,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v51 ; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v45 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v61 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr61 @@ -230858,6 +231023,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v45 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; kill: killed $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr61 @@ -240177,8 +240343,8 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 @@ -240245,8 +240411,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v60 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 @@ -240256,11 +240421,11 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v45, v39 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index 01e397d629ea9..64400ac3fff6e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -6473,8 +6473,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l @@ -6504,8 +6504,9 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 @@ -13668,8 +13669,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l @@ -13699,8 +13700,9 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ -20351,8 +20353,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l @@ -20382,8 +20384,9 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 @@ -26536,8 +26539,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l @@ -26567,8 +26570,9 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 @@ -32406,8 +32410,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v13.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v11.l @@ -32435,8 +32439,9 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3 @@ -37671,8 +37676,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v13.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v11.l @@ -37700,8 +37705,9 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3 @@ -41974,8 +41980,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v13.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v11.l @@ -42003,8 +42009,9 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 9041f64cb17fb..b2b1cdb1f58ca 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -4278,14 +4278,14 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 
offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 @@ -4305,17 +4305,17 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v44 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4572,13 +4572,13 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 @@ -4598,17 +4598,17 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v44 ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v4 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4784,13 +4784,13 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; 
GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 @@ -4810,17 +4810,17 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v44 ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4987,12 +4987,12 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 @@ -5027,14 +5027,15 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, 
v35.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3 @@ -11180,14 +11181,14 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 @@ -11207,17 +11208,17 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v44 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11474,13 +11475,13 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 @@ -11500,17 +11501,17 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v0 -; VI-NEXT: 
s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v44 ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v4 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11686,13 +11687,13 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 @@ -11712,17 +11713,17 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v44 ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11889,12 +11890,12 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v31, off, s32 offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 @@ -11929,14 +11930,15 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3 @@ -17628,15 +17630,15 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v31, v14 ; SI-NEXT: v_mov_b32_e32 v33, v12 ; SI-NEXT: v_mov_b32_e32 v38, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v5 ; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v7 @@ -17669,24 +17671,24 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v10 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(3) +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v10 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v16 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v14 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v18 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v32 -; SI-NEXT: ; 
implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 @@ -17740,6 +17742,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v14, v12, v15 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v16, 0xff, v42 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v34, v0, v7 @@ -17754,6 +17757,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 ; SI-NEXT: v_or_b32_e32 v18, v16, v19 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xff, v43 ; SI-NEXT: v_or_b32_e32 v12, v0, v12 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -17978,13 +17982,13 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v38, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 @@ -18006,17 +18010,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -18097,6 +18101,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v0, 3, v55 ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 @@ -18201,13 +18206,13 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v35, v4 ; GFX9-NEXT: v_mov_b32_e32 v33, v2 ; GFX9-NEXT: v_mov_b32_e32 v36, v0 -; GFX9-NEXT: 
buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:4 @@ -18229,17 +18234,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -18411,17 +18416,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l @@ -18463,8 +18468,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: 
v_cmpx_ne_u32_e32 0, v37 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ -23933,13 +23939,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_mov_b32_e32 v31, v2 ; SI-NEXT: v_mov_b32_e32 v35, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 @@ -23974,21 +23980,21 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32 +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -24101,6 +24107,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 @@ -24246,13 +24253,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v38, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; 
VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 @@ -24274,17 +24281,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -24365,6 +24372,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB62_4 ; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v0, 3, v55 ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 @@ -24469,13 +24477,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v35, v4 ; GFX9-NEXT: v_mov_b32_e32 v33, v2 ; GFX9-NEXT: v_mov_b32_e32 v36, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:4 @@ -24497,17 +24505,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) 
-; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -24679,17 +24687,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l @@ -24731,8 +24739,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v37 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB62_3 @@ -28267,13 +28276,13 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 @@ -28296,17 +28305,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, 
v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -28572,13 +28581,13 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 @@ -28600,17 +28609,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -28795,13 +28804,13 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v48, off, 
s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 @@ -28823,17 +28832,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -29004,17 +29013,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l @@ -29051,6 +29060,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3 @@ -32316,13 +32326,13 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 @@ -32345,17 +32355,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -32621,13 +32631,13 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 @@ -32649,17 +32659,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -32844,13 +32854,13 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: 
v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 @@ -32872,17 +32882,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -33053,17 +33063,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l @@ -33100,6 +33110,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index ee23420c2a662..ba195133dd5d1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -11426,17 +11426,17 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 @@ -12838,18 +12838,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -12861,6 +12849,18 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 @@ -12873,7 +12873,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l @@ -12904,12 +12904,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l @@ -12921,6 +12915,16 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 @@ -13204,17 +13208,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_b32 
v12, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8 @@ -13227,6 +13220,17 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64 ; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72 ; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68 @@ -13255,40 +13259,41 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 
8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13711,22 +13716,22 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 @@ -13748,21 +13753,19 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -13817,33 +13820,41 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 ; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 @@ -14119,6 +14130,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: .LBB27_4: ; SI-NEXT: v_mov_b32_e32 v27, v44 ; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB27_2 @@ -14155,22 +14167,22 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: 
buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 ; VI-NEXT: v_mov_b32_e32 v37, v30 ; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 @@ -14192,21 +14204,19 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -14235,19 +14245,27 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: 
s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -14461,6 +14479,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: .LBB27_4: ; VI-NEXT: v_mov_b32_e32 v28, v44 ; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB27_2 @@ -14497,22 +14516,22 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_mov_b32_e32 v37, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 @@ -14536,22 +14555,22 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_and_b64 
s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -14580,19 +14599,27 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -14807,6 +14834,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: .LBB27_4: ; GFX9-NEXT: v_mov_b32_e32 v28, v44 ; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB27_2 @@ -14819,8 +14847,6 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 
offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8 @@ -14829,6 +14855,8 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:28 @@ -14852,23 +14880,23 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -15204,8 +15232,6 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 @@ -15214,6 +15240,8 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 ; 
GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 @@ -15237,23 +15265,23 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -26438,17 +26466,17 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 @@ -27850,18 +27878,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -27873,6 +27889,18 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 @@ -27885,7 +27913,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l @@ -27916,12 +27944,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l @@ -27933,6 +27955,16 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(17) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ -28216,17 +28248,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8 @@ -28239,6 +28260,17 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64 ; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72 ; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68 @@ -28267,40 +28299,41 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) 
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -28723,22 +28756,22 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v55, off, 
s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 @@ -28760,21 +28793,19 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -28829,33 +28860,41 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 ; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 @@ -29131,6 +29170,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: .LBB51_4: ; SI-NEXT: v_mov_b32_e32 v27, v44 ; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch 
.LBB51_2 @@ -29167,22 +29207,22 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 ; VI-NEXT: v_mov_b32_e32 v37, v30 ; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 @@ -29204,21 +29244,19 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -29247,19 +29285,27 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt 
vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -29473,6 +29519,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: .LBB51_4: ; VI-NEXT: v_mov_b32_e32 v28, v44 ; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB51_2 @@ -29509,22 +29556,22 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 
+; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_mov_b32_e32 v37, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 @@ -29548,22 +29595,22 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -29592,19 +29639,27 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, 
v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -29819,6 +29874,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: .LBB51_4: ; GFX9-NEXT: v_mov_b32_e32 v28, v44 ; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB51_2 @@ -29831,8 +29887,6 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8 @@ -29841,6 +29895,8 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:28 @@ -29864,23 +29920,23 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -30216,8 +30272,6 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 @@ -30226,6 +30280,8 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 ; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 @@ -30249,23 +30305,23 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -40726,17 +40782,17 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:12 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 @@ -42138,18 +42194,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -42161,6 +42205,18 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 @@ -42173,7 +42229,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l @@ -42204,12 +42260,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x 
i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l @@ -42221,6 +42271,16 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 @@ -42504,17 +42564,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8 @@ -42527,6 +42576,17 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64 ; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72 ; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 
offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68 @@ -42555,40 +42615,41 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -43011,22 +43072,22 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; 
SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 @@ -43048,21 +43109,19 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -43117,33 +43176,41 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: 
v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 ; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 @@ -43419,6 +43486,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: .LBB71_4: ; SI-NEXT: v_mov_b32_e32 v27, v44 ; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB71_2 @@ -43455,22 +43523,22 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 ; VI-NEXT: v_mov_b32_e32 v37, v30 ; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 @@ -43492,21 +43560,19 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: 
v_lshlrev_b32_e32 v29, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -43535,19 +43601,27 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43761,6 +43835,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: .LBB71_4: ; VI-NEXT: v_mov_b32_e32 v28, v44 ; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB71_2 @@ -43797,22 +43872,22 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 
; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_mov_b32_e32 v37, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 @@ -43836,22 +43911,22 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -43880,19 +43955,27 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -44107,6 +44190,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: .LBB71_4: ; GFX9-NEXT: v_mov_b32_e32 v28, v44 ; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB71_2 @@ -44119,8 +44203,6 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8 @@ -44129,6 +44211,8 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:28 @@ -44152,23 +44236,23 @@ define inreg <8 x i64> 
@bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -44504,8 +44588,6 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 @@ -44514,6 +44596,8 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 ; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 @@ -44537,23 +44621,23 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) ; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -54174,17 +54258,17 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 @@ -55586,18 +55670,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -55609,6 +55681,18 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, 
s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 @@ -55621,7 +55705,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l @@ -55652,12 +55736,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l @@ -55669,6 +55747,16 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 @@ -55952,17 +56040,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 
offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8 @@ -55975,6 +56052,17 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64 ; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72 ; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68 @@ -56003,40 +56091,41 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v29, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -56459,22 +56548,22 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 @@ -56496,21 +56585,19 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 -; SI-NEXT: s_waitcnt 
vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -56565,33 +56652,41 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 ; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 @@ -56867,6 +56962,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: .LBB87_4: ; SI-NEXT: v_mov_b32_e32 v27, v44 ; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB87_2 @@ -56903,22 +56999,22 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; VI-NEXT: 
buffer_load_ushort v42, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 ; VI-NEXT: v_mov_b32_e32 v37, v30 ; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 @@ -56940,21 +57036,19 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -56983,19 +57077,27 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -57209,6 +57311,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: .LBB87_4: ; VI-NEXT: v_mov_b32_e32 v28, v44 ; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB87_2 @@ -57245,22 +57348,22 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_mov_b32_e32 v37, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 @@ -57284,22 +57387,22 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; GFX9-NEXT: 
s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -57328,19 +57431,27 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -57555,6 +57666,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: .LBB87_4: ; GFX9-NEXT: v_mov_b32_e32 v28, v44 ; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB87_2 @@ -57567,8 +57679,6 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8 @@ -57577,6 +57687,8 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:28 @@ -57600,23 +57712,23 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -57952,8 +58064,6 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 @@ -57962,6 +58072,8 @@ define inreg <8 x double> 
@bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 ; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 @@ -57985,23 +58097,23 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -67744,6 +67856,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 @@ -67767,7 +67881,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 @@ -67784,6 +67897,9 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: 
v_lshlrev_b32_e32 v16, 24, v4 ; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 @@ -67792,21 +67908,21 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 ; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v34 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v37 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v38 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v25 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v48 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr48 @@ -67819,10 +67935,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -69388,18 +69500,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16 @@ -69411,6 +69511,18 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v55, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68 @@ -69426,7 +69538,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l @@ -69459,27 +69571,30 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3 @@ -69764,17 +69879,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 ; 
GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:8 @@ -69787,6 +69891,17 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:64 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:72 ; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:68 @@ -69815,37 +69930,41 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v96 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v100 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v101 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v102 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v103 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v113 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) ; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -70007,10 +70126,13 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-FAKE16-NEXT: .LBB98_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v64, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -70200,24 +70322,24 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v46, v30 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v62, 
off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; SI-NEXT: v_readfirstlane_b32 s43, v1 ; SI-NEXT: v_readfirstlane_b32 s42, v0 @@ -70239,25 +70361,26 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v35 ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill @@ -70278,6 +70401,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v1, v1, v40 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) @@ -70773,6 +70897,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mov_b32_e32 v39, v32 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $sgpr6 @@ -70841,22 +70966,22 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; VI-NEXT: 
buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 ; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 @@ -70881,19 +71006,15 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v28 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v4 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v6 ; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 ; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v12 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -70916,19 +71037,27 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) ; 
VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -70975,6 +71104,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 ; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 @@ -71181,22 +71311,22 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 @@ -71222,22 +71352,22 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt 
vmcnt(17) ; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 -; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 ; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -71278,38 +71408,46 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -71324,6 +71462,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB99_3 ; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 ; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: 
v_add_u32_e32 v14, 0x300, v3 @@ -71410,6 +71549,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 @@ -71505,8 +71645,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8 @@ -71515,6 +71653,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:28 @@ -71538,23 +71678,23 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -71697,6 +71837,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s1, s9, s10 @@ -71831,8 +71972,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 @@ -71841,6 +71980,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 ; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 @@ -71864,23 +72005,23 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -81810,18 +81951,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v52, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16 @@ -81833,6 +81962,18 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68 @@ -81848,7 +81989,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l @@ -81881,27 +82022,30 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l -; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3 @@ -82186,17 +82330,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:8 @@ -82209,6 +82342,17 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:64 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:72 ; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:68 @@ -82237,37 +82381,41 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 
0, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v96 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v100 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v101 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v102 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v103 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v113 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -82429,10 +82577,13 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-FAKE16-NEXT: .LBB106_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v64, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -83114,22 +83265,22 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v10, 
off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 ; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 @@ -83154,19 +83305,15 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v28 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v4 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v6 ; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 ; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v12 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -83189,19 +83336,27 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -83248,6 +83403,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_cbranch_execnz .LBB107_3 ; VI-NEXT: .LBB107_2: ; %cmp.true ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 ; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 @@ -83454,22 +83610,22 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 @@ -83495,22 +83651,22 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt 
vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 -; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 ; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -83551,38 +83707,46 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -83597,6 +83761,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB107_3 ; 
GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 ; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 @@ -83683,6 +83848,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 @@ -83778,8 +83944,6 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8 @@ -83788,6 +83952,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:28 @@ -83811,23 +83977,23 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -83970,6 +84136,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: 
s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 @@ -84104,8 +84271,6 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 @@ -84114,6 +84279,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 ; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 @@ -84137,23 +84304,23 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB107_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -90408,6 +90575,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 @@ -90428,7 +90597,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27 @@ -90452,39 +90620,37 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v17 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v20 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v24 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v31 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v32 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v36 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -92045,18 +92211,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v31, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16 @@ -92068,6 +92222,18 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68 @@ -92083,7 +92249,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l @@ -92116,27 +92282,30 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 
8, v64.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3 @@ -92421,17 +92590,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:8 @@ -92444,6 +92602,17 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:64 ; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:72 ; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:68 @@ -92472,37 +92641,41 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v96 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v100 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v101 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v102 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v103 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v113 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -92664,10 +92837,13 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-FAKE16-NEXT: .LBB110_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v64, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -92849,21 +93025,20 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 
offset:52 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 ; SI-NEXT: v_readfirstlane_b32 s46, v30 ; SI-NEXT: v_readfirstlane_b32 s44, v23 @@ -92890,14 +93065,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v37 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v38 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v38 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v37 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v45 ; SI-NEXT: s_cbranch_scc0 .LBB111_3 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -92945,6 +93120,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 ; SI-NEXT: s_and_b32 s4, s45, 0xff ; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v25, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v3 @@ -92961,6 +93137,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: s_and_b32 s4, s46, 0xff ; SI-NEXT: s_lshl_b32 s5, s47, 8 ; SI-NEXT: v_or_b32_e32 v32, v29, v25 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v29, 0xff, v40 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 @@ -93353,22 +93530,22 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 ; VI-NEXT: 
buffer_load_ushort v33, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 ; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 @@ -93393,19 +93570,15 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v28 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v4 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v6 ; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 ; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v12 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB111_4 ; VI-NEXT: ; %bb.1: ; %cmp.false @@ -93428,19 +93601,27 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -93487,6 +93668,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB111_3 ; VI-NEXT: .LBB111_2: ; %cmp.true ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 ; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 @@ -93693,22 +93875,22 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 @@ -93734,22 +93916,22 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 -; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; 
GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 ; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false @@ -93790,38 +93972,46 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -93836,6 +94026,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB111_3 ; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 ; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 @@ -93922,6 +94113,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 
+; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 @@ -94017,8 +94209,6 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8 @@ -94027,6 +94217,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:28 @@ -94050,23 +94242,23 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -94209,6 +94401,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 @@ -94343,8 +94536,6 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, 
v4 :: v_dual_mov_b32 v37, v2 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 ; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 @@ -94353,6 +94544,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 ; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 ; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 @@ -94376,23 +94569,23 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB111_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 5d4df4bde1af8..f0c32e8b2b7a3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -3418,10 +3418,10 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v36, v22 @@ -3449,13 +3449,13 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 
%b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10551,10 +10551,10 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v36, v22 @@ -10582,13 +10582,13 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -16900,10 +16900,10 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v36, v22 @@ -16931,13 +16931,13 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 
v44, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -22476,10 +22476,10 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v36, v22 @@ -22507,13 +22507,13 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -26814,12 +26814,12 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr48 @@ -26865,7 +26865,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 44cfd6c28ca6a..fd392b702568e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -3566,10 +3566,10 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 @@ -3595,28 +3595,28 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -11765,10 +11765,10 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 
offset:28 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 @@ -11794,28 +11794,28 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -19274,10 +19274,10 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 @@ -19303,28 +19303,28 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; 
SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -26013,10 +26013,10 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 @@ -26042,28 +26042,28 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(1) -; 
SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -31405,9 +31405,9 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 @@ -31472,7 +31472,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -31618,6 +31618,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 87d5157b3c340..01625cd53ef68 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -3816,8 +3816,8 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 @@ -3843,44 +3843,44 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, 
off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12779,8 +12779,8 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 @@ -12806,44 +12806,44 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 
4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -21028,8 +21028,8 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 @@ -21055,44 +21055,44 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -28444,8 +28444,8 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 @@ -28471,44 +28471,44 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 
v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -34405,13 +34405,13 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 ; 
SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 @@ -34488,7 +34488,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; kill: killed $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -34668,6 +34668,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index fb2e94fc3b87a..2dc27719b5977 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -4077,14 +4077,14 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -4101,44 +4101,45 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 
; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -13914,14 +13915,14 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -13938,44 +13939,45 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23014,14 +23016,14 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; 
SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -23038,44 +23040,45 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -31231,14 +31234,14 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword 
v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -31255,44 +31258,45 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -37854,6 +37858,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 @@ -37864,7 +37869,6 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 @@ -37888,7 +37892,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -37988,6 +37992,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v40 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -38053,7 +38058,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -38061,7 +38066,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -38167,6 +38172,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 
07cdbef82d892..5a978534eeb9e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -4372,12 +4372,12 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 @@ -4395,61 +4395,61 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], 
s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -5519,10 +5519,10 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v62, v30 @@ -5556,10 +5556,10 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -5607,11 +5607,13 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff @@ -5774,7 +5776,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v47, v43 ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v38 @@ -15107,12 +15109,12 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 @@ -15130,61 +15132,61 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: 
s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -16254,10 +16256,10 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v62, v30 @@ -16291,10 +16293,10 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -16342,11 +16344,13 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x 
i16> inreg %a, ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff @@ -16509,7 +16513,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v47, v43 ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v38 @@ -25054,12 +25058,12 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 @@ -25077,61 +25081,61 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], 
s32 offset:84 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -26201,10 +26205,10 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v62, v30 @@ -26238,10 +26242,10 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: 
v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -26289,11 +26293,13 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff @@ -26456,7 +26462,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v47, v43 ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v38 @@ -34084,12 +34090,12 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 @@ -34107,61 +34113,61 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 
v41, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -35231,10 +35237,10 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 
SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v62, v30 @@ -35268,10 +35274,10 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -35319,11 +35325,13 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff @@ -35486,7 +35494,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v47, v43 ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v38 @@ -41338,6 +41346,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 @@ -41353,7 +41362,6 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 @@ -41372,7 +41380,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr39 @@ -41561,7 +41569,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 
; 4-byte Folded Spill @@ -41585,7 +41593,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -41701,6 +41709,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -42652,9 +42661,9 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 8eb71e90f8504..a1e41c91784b2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -4696,8 +4696,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 @@ -4716,70 +4716,70 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: 
v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill 
; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -5945,14 +5945,14 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 ; SI-NEXT: v_mov_b32_e32 v31, v26 ; SI-NEXT: v_mov_b32_e32 v41, v24 @@ -5987,14 +5987,13 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -6042,21 +6041,25 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff @@ -6238,7 +6241,9 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mov_b32_e32 v51, v47 ; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 @@ -16321,8 +16326,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 @@ -16341,70 +16346,70 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: 
buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -17570,14 +17575,14 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 ; SI-NEXT: v_mov_b32_e32 v31, v26 ; SI-NEXT: v_mov_b32_e32 v41, v24 @@ -17612,14 +17617,13 @@ 
define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -17667,21 +17671,25 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff @@ -17863,7 +17871,9 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mov_b32_e32 v51, v47 ; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 @@ -27111,8 +27121,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 @@ -27131,70 +27141,70 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: 
v_lshlrev_b32_e32 v32, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -28360,14 +28370,14 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 ; SI-NEXT: v_mov_b32_e32 v31, v26 ; SI-NEXT: v_mov_b32_e32 v41, v24 @@ -28402,14 +28412,13 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -28457,21 +28466,25 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff @@ -28653,7 +28666,9 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mov_b32_e32 v51, v47 ; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 @@ -36929,8 +36944,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 @@ -36949,70 +36964,70 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -38178,14 +38193,14 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 
offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 ; SI-NEXT: v_mov_b32_e32 v31, v26 ; SI-NEXT: v_mov_b32_e32 v41, v24 @@ -38220,14 +38235,13 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -38275,21 +38289,25 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff @@ -38471,7 +38489,9 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mov_b32_e32 v51, v47 ; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 @@ -44804,6 +44824,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 @@ -44823,7 +44844,6 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v52, 
off, s[0:3], s32 offset:28 @@ -44838,7 +44858,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; kill: killed $vgpr55 @@ -45051,7 +45071,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill @@ -45059,7 +45079,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill @@ -45217,6 +45237,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -46260,8 +46281,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 93c11f13ce3ce..462239804f415 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -5003,10 +5003,11 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 @@ -5024,74 +5025,73 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], 
s32 offset:24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 @@ -5099,6 +5099,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -6350,18 +6352,18 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 @@ -6401,13 +6403,12 @@ define inreg <30 x 
i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -6701,6 +6702,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v61, v52 ; SI-NEXT: v_mov_b32_e32 v52, v59 ; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v51, v57 ; SI-NEXT: v_mov_b32_e32 v57, v50 ; SI-NEXT: v_mov_b32_e32 v50, v47 @@ -10355,17 +10357,17 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 @@ -17541,10 +17543,11 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 @@ -17562,74 +17565,73 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 
offset:4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 
v37, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 @@ -17637,6 +17639,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -18888,18 +18892,18 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 @@ -18939,13 +18943,12 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; SI-NEXT: 
s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -19239,6 +19242,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v61, v52 ; SI-NEXT: v_mov_b32_e32 v52, v59 ; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v51, v57 ; SI-NEXT: v_mov_b32_e32 v57, v50 ; SI-NEXT: v_mov_b32_e32 v50, v47 @@ -23054,17 +23058,17 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 @@ -29211,10 +29215,11 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 @@ -29232,74 +29237,73 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:108 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v37, 
16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 @@ -29307,6 +29311,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -30558,18 +30564,18 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 @@ -30609,13 +30615,12 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -30909,6 +30914,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v61, v52 ; SI-NEXT: v_mov_b32_e32 v52, v59 ; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v51, v57 ; SI-NEXT: v_mov_b32_e32 v57, v50 ; SI-NEXT: v_mov_b32_e32 v50, v47 @@ -34580,17 
+34586,17 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 @@ -39859,10 +39865,11 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 @@ -39880,74 +39887,73 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 @@ -39955,6 +39961,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -41206,18 +41214,18 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 @@ -41257,13 +41265,12 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -41557,6 +41564,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v61, v52 ; SI-NEXT: v_mov_b32_e32 v52, v59 ; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v51, v57 ; SI-NEXT: v_mov_b32_e32 v57, v50 ; SI-NEXT: v_mov_b32_e32 v50, v47 @@ -45272,17 +45280,17 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:32 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 @@ -49981,6 +49989,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(4) @@ -49997,7 +50006,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 30ad46d959b7e..34ebe60dd1e03 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -815,13 +815,13 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48 ; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0 -; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0 ; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen @@ -836,6 +836,7 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, v0 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0 +; GCN-NEXT: s_waitcnt vmcnt(6) ; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen @@ -846,11 +847,12 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; GCN-NEXT: 
s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: s_waitcnt vmcnt(9) ; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(12) ; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen @@ -9024,6 +9026,21 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62 +; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60 +; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58 +; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56 +; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54 +; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52 +; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50 +; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 +; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 +; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42 +; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40 +; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38 +; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36 +; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34 ; GCN-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2 ; GCN-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4 @@ -9040,23 +9057,8 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26 ; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28 ; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30 -; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50 -; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52 -; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54 -; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56 -; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58 -; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60 -; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62 ; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34 -; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36 -; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38 -; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40 -; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42 -; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 -; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 -; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: s_waitcnt 
vmcnt(14) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xfc, v0 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 @@ -9122,7 +9124,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xac, v0 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa8, v0 -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen @@ -9178,7 +9180,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x58, v0 ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0 -; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen @@ -9294,15 +9296,16 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52 ; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50 ; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34 -; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36 -; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38 -; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40 -; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42 -; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44 ; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46 +; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44 +; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42 +; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40 +; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38 +; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36 +; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34 +; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32 ; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30 ; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2 ; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4 ; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6 @@ -9317,7 +9320,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24 ; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26 ; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28 -; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30 ; GFX7-NEXT: s_waitcnt vmcnt(14) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 @@ -9419,7 +9421,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x80, v0 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX7-NEXT: 
v_cvt_f64_f32_e32 v[1:2], v16 -; GFX7-NEXT: s_waitcnt vmcnt(14) ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v34 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x7c, v0 @@ -9427,6 +9428,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0 ; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(14) ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v33 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 @@ -9597,14 +9599,14 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1 ; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_ushort v42, v[42:43] -; GFX8-NEXT: flat_load_ushort v34, v[33:34] -; GFX8-NEXT: flat_load_ushort v36, v[35:36] -; GFX8-NEXT: flat_load_ushort v38, v[37:38] ; GFX8-NEXT: flat_load_ushort v39, v[48:49] ; GFX8-NEXT: flat_load_ushort v48, v[50:51] ; GFX8-NEXT: flat_load_ushort v51, v[52:53] -; GFX8-NEXT: flat_load_ushort v52, v[54:55] ; GFX8-NEXT: flat_load_ushort v53, v[40:41] +; GFX8-NEXT: flat_load_ushort v52, v[54:55] +; GFX8-NEXT: flat_load_ushort v38, v[37:38] +; GFX8-NEXT: flat_load_ushort v34, v[33:34] +; GFX8-NEXT: flat_load_ushort v36, v[35:36] ; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1 ; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_ushort v37, v[3:4] @@ -9620,6 +9622,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(14) ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v44 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3 +; GFX8-NEXT: flat_load_ushort v13, v[49:50] ; GFX8-NEXT: flat_load_ushort v3, v[17:18] ; GFX8-NEXT: flat_load_ushort v5, v[21:22] ; GFX8-NEXT: flat_load_ushort v7, v[23:24] @@ -9627,7 +9630,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: flat_load_ushort v10, v[27:28] ; GFX8-NEXT: flat_load_ushort v11, v[29:30] ; GFX8-NEXT: flat_load_ushort v12, v[31:32] -; GFX8-NEXT: flat_load_ushort v13, v[49:50] ; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0 ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen @@ -9848,22 +9850,22 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX900-NEXT: global_load_ushort v22, v[1:2], off offset:38 ; GFX900-NEXT: global_load_ushort v23, v[1:2], off offset:36 ; GFX900-NEXT: global_load_ushort v24, v[1:2], off offset:34 -; GFX900-NEXT: global_load_ushort v25, v[1:2], off offset:32 ; GFX900-NEXT: global_load_ushort v26, v[1:2], off +; GFX900-NEXT: global_load_ushort v25, v[1:2], off offset:32 ; GFX900-NEXT: global_load_ushort v27, v[1:2], off offset:2 -; GFX900-NEXT: global_load_ushort v3, v[1:2], off offset:16 -; GFX900-NEXT: global_load_ushort v4, v[1:2], off offset:18 -; GFX900-NEXT: global_load_ushort v5, v[1:2], off offset:20 -; GFX900-NEXT: global_load_ushort v6, v[1:2], off offset:22 -; GFX900-NEXT: global_load_ushort v8, v[1:2], off offset:24 ; GFX900-NEXT: global_load_ushort v28, v[1:2], off offset:30 -; GFX900-NEXT: global_load_ushort v29, v[1:2], off offset:26 -; GFX900-NEXT: global_load_ushort v30, v[1:2], off offset:28 ; GFX900-NEXT: global_load_ushort v31, v[1:2], off offset:4 +; GFX900-NEXT: global_load_ushort v30, v[1:2], off offset:28 ; GFX900-NEXT: global_load_ushort v32, 
v[1:2], off offset:6 +; GFX900-NEXT: global_load_ushort v29, v[1:2], off offset:26 ; GFX900-NEXT: global_load_ushort v33, v[1:2], off offset:8 +; GFX900-NEXT: global_load_ushort v8, v[1:2], off offset:24 ; GFX900-NEXT: global_load_ushort v34, v[1:2], off offset:10 +; GFX900-NEXT: global_load_ushort v6, v[1:2], off offset:22 ; GFX900-NEXT: global_load_ushort v7, v[1:2], off offset:12 +; GFX900-NEXT: global_load_ushort v5, v[1:2], off offset:20 +; GFX900-NEXT: global_load_ushort v3, v[1:2], off offset:16 +; GFX900-NEXT: global_load_ushort v4, v[1:2], off offset:18 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_load_ushort v1, v[1:2], off offset:14 ; GFX900-NEXT: s_waitcnt vmcnt(31) @@ -9936,21 +9938,22 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 ; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:144 -; GFX900-NEXT: s_waitcnt vmcnt(44) +; GFX900-NEXT: s_waitcnt vmcnt(43) ; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v25 ; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140 ; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 -; GFX900-NEXT: s_waitcnt vmcnt(38) +; GFX900-NEXT: s_waitcnt vmcnt(43) ; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v28 ; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132 ; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 -; GFX900-NEXT: s_waitcnt vmcnt(38) +; GFX900-NEXT: s_waitcnt vmcnt(43) ; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v30 ; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 ; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v15 +; GFX900-NEXT: s_waitcnt vmcnt(43) ; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v29 ; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116 ; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112 @@ -9959,29 +9962,28 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v2 -; GFX900-NEXT: s_waitcnt vmcnt(41) ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GFX900-NEXT: s_waitcnt vmcnt(43) ; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v2 -; GFX900-NEXT: s_waitcnt vmcnt(40) ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108 ; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v2 -; GFX900-NEXT: s_waitcnt vmcnt(41) ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v2 -; GFX900-NEXT: s_waitcnt vmcnt(40) +; GFX900-NEXT: s_waitcnt vmcnt(44) ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; GFX900-NEXT: s_waitcnt vmcnt(43) ; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 +; GFX900-NEXT: s_waitcnt vmcnt(41) ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 ; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 -; GFX900-NEXT: 
s_waitcnt vmcnt(41) ; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v7 ; GFX900-NEXT: s_waitcnt vmcnt(40) ; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1 @@ -10032,37 +10034,37 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse ; GFX950-NEXT: global_load_ushort v1, v[2:3], off offset:2 -; GFX950-NEXT: global_load_ushort v4, v[2:3], off offset:12 -; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:8 -; GFX950-NEXT: global_load_ushort v6, v[2:3], off offset:4 ; GFX950-NEXT: global_load_ushort v7, v[2:3], off ; GFX950-NEXT: global_load_ushort v8, v[2:3], off offset:6 +; GFX950-NEXT: global_load_ushort v6, v[2:3], off offset:4 ; GFX950-NEXT: global_load_ushort v9, v[2:3], off offset:10 +; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:8 ; GFX950-NEXT: global_load_ushort v10, v[2:3], off offset:14 +; GFX950-NEXT: global_load_ushort v4, v[2:3], off offset:12 ; GFX950-NEXT: global_load_ushort v11, v[2:3], off offset:18 -; GFX950-NEXT: global_load_ushort v12, v[2:3], off offset:28 -; GFX950-NEXT: global_load_ushort v13, v[2:3], off offset:24 -; GFX950-NEXT: global_load_ushort v14, v[2:3], off offset:20 ; GFX950-NEXT: global_load_ushort v15, v[2:3], off offset:16 ; GFX950-NEXT: global_load_ushort v16, v[2:3], off offset:22 +; GFX950-NEXT: global_load_ushort v14, v[2:3], off offset:20 ; GFX950-NEXT: global_load_ushort v17, v[2:3], off offset:26 +; GFX950-NEXT: global_load_ushort v13, v[2:3], off offset:24 ; GFX950-NEXT: global_load_ushort v18, v[2:3], off offset:30 +; GFX950-NEXT: global_load_ushort v12, v[2:3], off offset:28 ; GFX950-NEXT: global_load_ushort v19, v[2:3], off offset:34 -; GFX950-NEXT: global_load_ushort v20, v[2:3], off offset:44 -; GFX950-NEXT: global_load_ushort v21, v[2:3], off offset:40 -; GFX950-NEXT: global_load_ushort v22, v[2:3], off offset:36 ; GFX950-NEXT: global_load_ushort v23, v[2:3], off offset:32 ; GFX950-NEXT: global_load_ushort v24, v[2:3], off offset:38 +; GFX950-NEXT: global_load_ushort v22, v[2:3], off offset:36 ; GFX950-NEXT: global_load_ushort v25, v[2:3], off offset:42 +; GFX950-NEXT: global_load_ushort v21, v[2:3], off offset:40 ; GFX950-NEXT: global_load_ushort v26, v[2:3], off offset:46 +; GFX950-NEXT: global_load_ushort v20, v[2:3], off offset:44 ; GFX950-NEXT: global_load_ushort v42, v[2:3], off offset:50 ; GFX950-NEXT: global_load_ushort v43, v[2:3], off offset:62 ; GFX950-NEXT: global_load_ushort v46, v[2:3], off offset:60 -; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:56 -; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:52 ; GFX950-NEXT: global_load_ushort v56, v[2:3], off offset:48 ; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:54 ; GFX950-NEXT: global_load_ushort v58, v[2:3], off offset:58 +; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:56 +; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:52 ; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse @@ -10071,51 +10073,53 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX950-NEXT: s_waitcnt vmcnt(31) ; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX950-NEXT: s_waitcnt vmcnt(30) -; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; GFX950-NEXT: s_waitcnt vmcnt(29) -; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v5 
-; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX950-NEXT: s_waitcnt vmcnt(28) +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX950-NEXT: s_waitcnt vmcnt(27) -; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v9 ; GFX950-NEXT: s_waitcnt vmcnt(26) -; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; GFX950-NEXT: s_waitcnt vmcnt(25) -; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v9 -; GFX950-NEXT: s_waitcnt vmcnt(24) ; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; GFX950-NEXT: s_waitcnt vmcnt(24) +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v4 ; GFX950-NEXT: s_waitcnt vmcnt(23) ; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v11 ; GFX950-NEXT: s_waitcnt vmcnt(22) -; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; GFX950-NEXT: s_waitcnt vmcnt(21) -; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 ; GFX950-NEXT: s_waitcnt vmcnt(20) ; GFX950-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX950-NEXT: s_waitcnt vmcnt(19) -; GFX950-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; GFX950-NEXT: s_waitcnt vmcnt(18) -; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v16 -; GFX950-NEXT: v_cvt_f64_f32_e32 v[12:13], v27 -; GFX950-NEXT: s_waitcnt vmcnt(16) +; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX950-NEXT: s_waitcnt vmcnt(17) ; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v18 +; GFX950-NEXT: s_waitcnt vmcnt(16) +; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v12 ; GFX950-NEXT: s_waitcnt vmcnt(15) ; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v19 -; GFX950-NEXT: s_waitcnt vmcnt(14) -; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v20 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[12:13], v27 ; GFX950-NEXT: s_waitcnt vmcnt(13) -; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v24 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[14:15], v30 -; GFX950-NEXT: v_cvt_f64_f32_e32 v[20:21], v31 +; GFX950-NEXT: s_waitcnt vmcnt(11) +; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v25 ; GFX950-NEXT: s_waitcnt vmcnt(10) -; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v21 ; GFX950-NEXT: s_waitcnt vmcnt(9) -; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v25 -; GFX950-NEXT: s_waitcnt vmcnt(8) ; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; GFX950-NEXT: s_waitcnt vmcnt(8) +; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v20 ; GFX950-NEXT: s_waitcnt vmcnt(7) ; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v42 ; GFX950-NEXT: s_waitcnt vmcnt(6) ; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v43 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[20:21], v31 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[18:19], v32 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[24:25], v33 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[26:27], v36 @@ -10127,10 +10131,11 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX950-NEXT: s_waitcnt vmcnt(5) ; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v46 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v42 -; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(2) ; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v58 ; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:240 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v46 +; GFX950-NEXT: s_waitcnt vmcnt(2) ; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v47 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v1 ; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v56 @@ -10141,11 +10146,11 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr 
addrspace(1) %ptr) { ; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v23 ; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v22 ; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:224 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v29 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v1 +; GFX950-NEXT: s_waitcnt vmcnt(2) ; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v60 -; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v29 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[22:23], v34 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[28:29], v35 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[34:35], v48 @@ -10386,105 +10391,105 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: global_load_u16 v3, v[1:2], off offset:2 -; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:12 -; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:8 -; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:4 ; GFX11-NEXT: global_load_u16 v7, v[1:2], off ; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:6 +; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:4 ; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:10 +; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:8 ; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:14 +; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:12 ; GFX11-NEXT: global_load_u16 v11, v[1:2], off offset:18 -; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:28 -; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:24 -; GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:20 ; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:16 ; GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:22 +; GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:20 ; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:26 +; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:24 ; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:30 +; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:28 ; GFX11-NEXT: global_load_u16 v19, v[1:2], off offset:34 -; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:44 -; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:40 -; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:36 ; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:32 ; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:38 +; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:36 ; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:42 +; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:40 ; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:46 +; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:44 ; GFX11-NEXT: global_load_u16 v27, v[1:2], off offset:50 -; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:60 -; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:56 -; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:52 ; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:48 ; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54 +; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:52 ; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58 +; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:56 +; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:60 ; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62 ; GFX11-NEXT: s_waitcnt vmcnt(31) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: s_waitcnt vmcnt(30) -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7 ; GFX11-NEXT: s_waitcnt 
vmcnt(29) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; GFX11-NEXT: s_waitcnt vmcnt(28) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-NEXT: s_waitcnt vmcnt(27) -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX11-NEXT: s_waitcnt vmcnt(26) -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX11-NEXT: s_waitcnt vmcnt(25) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: s_waitcnt vmcnt(24) ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: s_waitcnt vmcnt(24) +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v4 ; GFX11-NEXT: s_waitcnt vmcnt(23) ; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v11 ; GFX11-NEXT: s_waitcnt vmcnt(22) -; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v15 ; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v16 ; GFX11-NEXT: s_waitcnt vmcnt(20) ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX11-NEXT: s_waitcnt vmcnt(19) -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX11-NEXT: s_waitcnt vmcnt(18) -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX11-NEXT: s_waitcnt vmcnt(17) -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-NEXT: s_waitcnt vmcnt(16) ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v12 ; GFX11-NEXT: s_waitcnt vmcnt(15) ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v19 ; GFX11-NEXT: s_waitcnt vmcnt(14) -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v23 ; GFX11-NEXT: s_waitcnt vmcnt(13) -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v24 ; GFX11-NEXT: s_waitcnt vmcnt(12) ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-NEXT: s_waitcnt vmcnt(11) -; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v20 ; GFX11-NEXT: s_waitcnt vmcnt(7) ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v27 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v31 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v32 ; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v28 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v65 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v29 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v64 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v33 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v29 
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v65 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v1 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[80:81], v30 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v52 @@ -10539,46 +10544,48 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-NEXT: s_clause 0x1f ; GFX1250-NEXT: global_load_u16 v1, v[2:3], off offset:2 -; GFX1250-NEXT: global_load_u16 v4, v[2:3], off offset:12 -; GFX1250-NEXT: global_load_u16 v5, v[2:3], off offset:8 -; GFX1250-NEXT: global_load_u16 v6, v[2:3], off offset:4 ; GFX1250-NEXT: global_load_u16 v7, v[2:3], off ; GFX1250-NEXT: global_load_u16 v8, v[2:3], off offset:6 +; GFX1250-NEXT: global_load_u16 v6, v[2:3], off offset:4 ; GFX1250-NEXT: global_load_u16 v9, v[2:3], off offset:10 +; GFX1250-NEXT: global_load_u16 v5, v[2:3], off offset:8 ; GFX1250-NEXT: global_load_u16 v10, v[2:3], off offset:14 -; GFX1250-NEXT: global_load_u16 v11, v[2:3], off offset:18 +; GFX1250-NEXT: global_load_u16 v4, v[2:3], off offset:12 ; GFX1250-NEXT: global_load_u16 v12, v[2:3], off offset:62 ; GFX1250-NEXT: global_load_u16 v13, v[2:3], off offset:60 +; GFX1250-NEXT: global_load_u16 v11, v[2:3], off offset:18 ; GFX1250-NEXT: global_load_u16 v14, v[2:3], off offset:58 ; GFX1250-NEXT: global_load_u16 v15, v[2:3], off offset:56 -; GFX1250-NEXT: global_load_u16 v16, v[2:3], off offset:28 -; GFX1250-NEXT: global_load_u16 v17, v[2:3], off offset:24 -; GFX1250-NEXT: global_load_u16 v18, v[2:3], off offset:20 ; GFX1250-NEXT: global_load_u16 v19, v[2:3], off offset:16 ; GFX1250-NEXT: global_load_u16 v20, v[2:3], off offset:22 +; GFX1250-NEXT: global_load_u16 v18, v[2:3], off offset:20 ; GFX1250-NEXT: global_load_u16 v21, v[2:3], off offset:26 +; GFX1250-NEXT: global_load_u16 v17, v[2:3], off offset:24 ; GFX1250-NEXT: global_load_u16 v22, v[2:3], off offset:30 +; GFX1250-NEXT: global_load_u16 v16, v[2:3], off offset:28 ; GFX1250-NEXT: global_load_u16 v23, v[2:3], off offset:34 -; GFX1250-NEXT: global_load_u16 v24, v[2:3], off offset:44 -; GFX1250-NEXT: global_load_u16 v25, v[2:3], off offset:40 -; GFX1250-NEXT: global_load_u16 v26, v[2:3], off offset:36 ; GFX1250-NEXT: global_load_u16 v27, v[2:3], off offset:32 ; GFX1250-NEXT: global_load_u16 v28, v[2:3], off offset:38 +; GFX1250-NEXT: global_load_u16 v26, v[2:3], off offset:36 ; GFX1250-NEXT: global_load_u16 v29, v[2:3], off offset:42 +; GFX1250-NEXT: global_load_u16 v25, v[2:3], off offset:40 ; GFX1250-NEXT: global_load_u16 v30, v[2:3], off offset:46 +; GFX1250-NEXT: global_load_u16 v24, v[2:3], off offset:44 ; GFX1250-NEXT: global_load_u16 v31, v[2:3], off offset:50 -; GFX1250-NEXT: global_load_u16 v32, v[2:3], off offset:52 ; GFX1250-NEXT: global_load_u16 v33, v[2:3], off offset:48 ; GFX1250-NEXT: global_load_u16 v34, v[2:3], off offset:54 +; GFX1250-NEXT: global_load_u16 v32, v[2:3], off offset:52 ; GFX1250-NEXT: s_wait_loadcnt 0x1e -; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v37, 16, v4 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v84, 16, v7 ; GFX1250-NEXT: s_wait_loadcnt 0x1c -; GFX1250-NEXT: v_dual_lshlrev_b32 v81, 16, v5 :: v_dual_lshlrev_b32 v85, 16, v6 -; GFX1250-NEXT: s_wait_loadcnt 0x1a -; GFX1250-NEXT: v_dual_lshlrev_b32 v84, 16, v7 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX1250-NEXT: v_dual_lshlrev_b32 v35, 16, v8 :: v_dual_lshlrev_b32 v85, 16, v6 +; GFX1250-NEXT: s_wait_loadcnt 0x1b +; GFX1250-NEXT: v_lshlrev_b32_e32 v80, 16, v9 +; GFX1250-NEXT: s_wait_loadcnt 0x19 
+; GFX1250-NEXT: v_dual_lshlrev_b32 v81, 16, v5 :: v_dual_lshlrev_b32 v36, 16, v10 ; GFX1250-NEXT: s_wait_loadcnt 0x18 -; GFX1250-NEXT: v_dual_lshlrev_b32 v80, 16, v9 :: v_dual_lshlrev_b32 v36, 16, v10 -; GFX1250-NEXT: s_wait_loadcnt 0x15 +; GFX1250-NEXT: v_lshlrev_b32_e32 v37, 16, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x16 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v12 :: v_dual_lshlrev_b32 v3, 16, v13 ; GFX1250-NEXT: s_wait_loadcnt 0x14 @@ -10588,27 +10595,29 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v2 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 ; GFX1250-NEXT: s_wait_loadcnt 0x11 -; GFX1250-NEXT: v_dual_lshlrev_b32 v68, 16, v17 :: v_dual_lshlrev_b32 v39, 16, v16 -; GFX1250-NEXT: s_wait_loadcnt 0xe ; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; GFX1250-NEXT: s_wait_loadcnt 0xc +; GFX1250-NEXT: s_wait_loadcnt 0xd ; GFX1250-NEXT: v_dual_lshlrev_b32 v21, 16, v21 :: v_dual_lshlrev_b32 v38, 16, v22 +; GFX1250-NEXT: s_wait_loadcnt 0xc +; GFX1250-NEXT: v_dual_lshlrev_b32 v68, 16, v17 :: v_dual_lshlrev_b32 v39, 16, v16 ; GFX1250-NEXT: s_wait_loadcnt 0x9 -; GFX1250-NEXT: v_dual_lshlrev_b32 v48, 16, v23 :: v_dual_lshlrev_b32 v25, 16, v25 -; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX1250-NEXT: v_dual_lshlrev_b32 v48, 16, v23 :: v_dual_lshlrev_b32 v49, 16, v28 +; GFX1250-NEXT: s_wait_loadcnt 0x7 +; GFX1250-NEXT: v_lshlrev_b32_e32 v64, 16, v29 ; GFX1250-NEXT: s_wait_loadcnt 0x5 -; GFX1250-NEXT: v_dual_lshlrev_b32 v49, 16, v28 :: v_dual_lshlrev_b32 v64, 16, v29 +; GFX1250-NEXT: v_dual_lshlrev_b32 v25, 16, v25 :: v_dual_lshlrev_b32 v50, 16, v30 ; GFX1250-NEXT: s_wait_loadcnt 0x3 -; GFX1250-NEXT: v_dual_lshlrev_b32 v50, 16, v30 :: v_dual_lshlrev_b32 v51, 16, v31 -; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v24, 16, v24 :: v_dual_lshlrev_b32 v51, 16, v31 +; GFX1250-NEXT: s_wait_loadcnt 0x1 ; GFX1250-NEXT: v_dual_lshlrev_b32 v33, 16, v33 :: v_dual_lshlrev_b32 v52, 16, v34 +; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_dual_lshlrev_b32 v32, 16, v32 :: v_dual_lshlrev_b32 v69, 16, v27 ; GFX1250-NEXT: v_lshlrev_b32_e32 v70, 16, v26 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[14:15], v35 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[54:55], v52 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[52:53], v32 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[30:31], v38 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[28:29], v39 @@ -48168,43 +48177,43 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_and_b32_e32 v36, 1, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 ; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 ; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 ; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; GCN-NEXT: v_and_b32_e32 v53, 1, 
v26 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108 ; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_and_b32_e32 v27, 1, v27 ; GCN-NEXT: v_and_b32_e32 v28, 1, v28 ; GCN-NEXT: v_and_b32_e32 v29, 1, v29 ; GCN-NEXT: v_and_b32_e32 v30, 1, v30 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:252 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:248 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:244 ; GCN-NEXT: s_waitcnt expcnt(6) ; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:240 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 ; GCN-NEXT: s_waitcnt vmcnt(14) ; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v37 ; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36 -; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v43 -; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: s_waitcnt vmcnt(6) ; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v44 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30 ; GCN-NEXT: v_cndmask_b32_e64 v30, v37, v36, s[4:5] @@ -48222,14 +48231,16 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212 ; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(13) ; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: s_waitcnt vmcnt(12) ; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v45 +; GCN-NEXT: s_waitcnt vmcnt(11) ; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: s_waitcnt vmcnt(9) +; GCN-NEXT: s_waitcnt vmcnt(10) ; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v46 ; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: s_waitcnt vmcnt(9) ; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v47 ; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 ; GCN-NEXT: s_waitcnt vmcnt(7) @@ -48279,14 +48290,14 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GCN-NEXT: v_cndmask_b32_e64 v23, v56, v50, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22 ; GCN-NEXT: v_cndmask_b32_e64 v22, v57, v49, s[4:5] -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 
-; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200 ; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204 ; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:208 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200 ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_and_b32_e32 v21, 1, v21 @@ -48295,13 +48306,13 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v60 ; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 ; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v61 -; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46 -; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: s_waitcnt vmcnt(6) ; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: s_waitcnt vmcnt(5) ; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(4) ; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21 ; GCN-NEXT: v_cndmask_b32_e64 v21, v58, v48, s[4:5] @@ -48331,9 +48342,13 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 ; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 ; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: s_waitcnt vmcnt(11) ; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: s_waitcnt vmcnt(10) ; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: s_waitcnt vmcnt(9) ; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17 ; GCN-NEXT: v_cndmask_b32_e64 v17, v52, v51, s[4:5] @@ -48883,42 +48898,41 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 -; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 -; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 -; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 -; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 -; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 -; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 -; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 -; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GFX8-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:108 -; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 -; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 -; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 -; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:124 -; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:60 -; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 +; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:60 +; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:124 +; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 +; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 +; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 +; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 +; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 +; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 +; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 +; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 +; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 +; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 +; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 +; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 +; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; GFX8-NEXT: s_waitcnt vmcnt(14) ; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX8-NEXT: v_cndmask_b32_e64 v24, v33, v24, s[38:39] ; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v26, s[36:37] ; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v28 @@ -48953,31 +48967,45 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[42:43] ; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] +; GFX8-NEXT: s_waitcnt vmcnt(13) ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GFX8-NEXT: s_waitcnt vmcnt(12) ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12 ; 
GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29] ; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] +; GFX8-NEXT: s_waitcnt vmcnt(11) ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; GFX8-NEXT: s_waitcnt vmcnt(10) ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[24:25] ; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] +; GFX8-NEXT: s_waitcnt vmcnt(9) ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX8-NEXT: s_waitcnt vmcnt(8) ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[20:21] ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] +; GFX8-NEXT: s_waitcnt vmcnt(7) ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX8-NEXT: s_waitcnt vmcnt(6) ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[16:17] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] +; GFX8-NEXT: s_waitcnt vmcnt(5) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: s_waitcnt vmcnt(4) ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(3) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc @@ -49103,99 +49131,114 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX900-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX900-NEXT: v_and_b32_e32 v0, 1, v30 ; GFX900-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 -; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX900-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 -; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GFX900-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GFX900-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 -; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; GFX900-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 -; GFX900-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GFX900-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 -; GFX900-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 -; GFX900-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 -; GFX900-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 -; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 -; GFX900-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GFX900-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 -; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 -; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 -; GFX900-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GFX900-NEXT: 
buffer_load_dword v25, off, s[0:3], s32 offset:52 -; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 -; GFX900-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 -; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60 ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60 +; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 +; GFX900-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 +; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; GFX900-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 +; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 +; GFX900-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 +; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 +; GFX900-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 +; GFX900-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX900-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 +; GFX900-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 +; GFX900-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 +; GFX900-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 +; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 +; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; GFX900-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GFX900-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX900-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 +; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 +; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(30) ; GFX900-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35] ; GFX900-NEXT: v_lshrrev_b32_e32 v32, 16, v32 ; GFX900-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31] +; GFX900-NEXT: s_waitcnt vmcnt(28) ; GFX900-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95] ; GFX900-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX900-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93] +; GFX900-NEXT: s_waitcnt vmcnt(26) ; GFX900-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91] ; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] +; GFX900-NEXT: s_waitcnt vmcnt(24) ; GFX900-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79] ; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; GFX900-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] +; GFX900-NEXT: s_waitcnt vmcnt(22) ; GFX900-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75] ; GFX900-NEXT: v_lshrrev_b32_e32 
v23, 16, v23 ; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GFX900-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] +; GFX900-NEXT: s_waitcnt vmcnt(20) ; GFX900-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63] ; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; GFX900-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] +; GFX900-NEXT: s_waitcnt vmcnt(18) ; GFX900-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59] ; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; GFX900-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] +; GFX900-NEXT: s_waitcnt vmcnt(16) ; GFX900-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47] ; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX900-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] +; GFX900-NEXT: s_waitcnt vmcnt(14) ; GFX900-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43] ; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX900-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] +; GFX900-NEXT: s_waitcnt vmcnt(12) ; GFX900-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] ; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX900-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] +; GFX900-NEXT: s_waitcnt vmcnt(10) ; GFX900-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25] ; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX900-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] +; GFX900-NEXT: s_waitcnt vmcnt(8) ; GFX900-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21] ; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX900-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] +; GFX900-NEXT: s_waitcnt vmcnt(6) ; GFX900-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17] ; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX900-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] +; GFX900-NEXT: s_waitcnt vmcnt(4) ; GFX900-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13] ; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9] ; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] ; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -49247,18 +49290,18 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:52 ; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:112 ; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:48 -; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:88 -; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:24 -; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:92 -; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:28 ; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:108 ; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:44 -; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:96 -; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:32 -; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:100 -; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:36 ; GFX950-NEXT: 
scratch_load_dword v44, off, s32 offset:104 ; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:40 +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:100 +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:36 +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:96 +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:32 +; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:92 +; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:28 +; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:88 +; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:24 ; GFX950-NEXT: v_and_b32_e32 v29, 1, v29 ; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v29 @@ -49350,7 +49393,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v23, v37, v36, vcc ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20 -; GFX950-NEXT: s_waitcnt vmcnt(16) +; GFX950-NEXT: s_waitcnt vmcnt(20) ; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v55 ; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v54 ; GFX950-NEXT: v_cndmask_b32_e32 v20, v54, v55, vcc @@ -49358,7 +49401,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 -; GFX950-NEXT: s_waitcnt vmcnt(10) +; GFX950-NEXT: s_waitcnt vmcnt(18) ; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v45 ; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v44 ; GFX950-NEXT: v_cndmask_b32_e32 v18, v44, v45, vcc @@ -49367,6 +49410,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse ; GFX950-NEXT: v_cndmask_b32_e32 v19, v37, v36, vcc ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX950-NEXT: s_waitcnt vmcnt(16) ; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v43 ; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v42 ; GFX950-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc @@ -49375,6 +49419,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse ; GFX950-NEXT: v_cndmask_b32_e32 v17, v37, v36, vcc ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX950-NEXT: s_waitcnt vmcnt(14) ; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v41 ; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v40 ; GFX950-NEXT: v_cndmask_b32_e32 v14, v40, v41, vcc @@ -49383,6 +49428,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse ; GFX950-NEXT: v_cndmask_b32_e32 v15, v37, v36, vcc ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX950-NEXT: s_waitcnt vmcnt(12) ; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v53 ; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v52 ; GFX950-NEXT: v_cndmask_b32_e32 v12, v52, v53, vcc @@ -49390,6 +49436,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v13, v37, v36, vcc ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX950-NEXT: s_waitcnt vmcnt(10) ; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v51 ; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v50 ; GFX950-NEXT: v_cndmask_b32_e32 v10, v50, v51, vcc @@ -49530,99 +49577,114 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_and_b32_e32 v0, 1, v30 ; GFX10-NEXT: v_cmp_eq_u32_e64 s44, 1, v0 ; GFX10-NEXT: s_clause 0x1f -; GFX10-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 -; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 -; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60 ; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:128 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 +; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; GFX10-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(30) ; GFX10-NEXT: v_cndmask_b32_e64 v32, v30, v31, s44 ; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX10-NEXT: v_cndmask_b32_e64 v30, v30, v31, s43 +; GFX10-NEXT: s_waitcnt vmcnt(28) ; GFX10-NEXT: v_cndmask_b32_e64 v31, v28, v29, s42 ; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v29, s41 +; GFX10-NEXT: s_waitcnt vmcnt(26) ; GFX10-NEXT: v_cndmask_b32_e64 v29, v26, v27, s40 ; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; GFX10-NEXT: v_cndmask_b32_e64 v26, v26, v27, s29 +; GFX10-NEXT: s_waitcnt vmcnt(24) ; GFX10-NEXT: v_cndmask_b32_e64 v27, v24, v25, s28 ; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v25, s27 +; GFX10-NEXT: s_waitcnt vmcnt(22) ; GFX10-NEXT: v_cndmask_b32_e64 v25, v22, v23, s26 ; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v23, s25 +; GFX10-NEXT: s_waitcnt vmcnt(20) ; GFX10-NEXT: v_cndmask_b32_e64 v23, v20, v21, s24 ; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v21, s23 +; GFX10-NEXT: s_waitcnt vmcnt(18) ; GFX10-NEXT: v_cndmask_b32_e64 v21, v18, v19, s22 ; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v19, s21 +; GFX10-NEXT: s_waitcnt vmcnt(16) ; GFX10-NEXT: v_cndmask_b32_e64 v19, v16, v17, s20 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v17, s19 +; GFX10-NEXT: s_waitcnt vmcnt(14) ; GFX10-NEXT: v_cndmask_b32_e64 v17, v14, v15, s18 ; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v15, s17 +; GFX10-NEXT: s_waitcnt vmcnt(12) ; GFX10-NEXT: v_cndmask_b32_e64 v15, v12, v13, s16 ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v13, s15 +; GFX10-NEXT: s_waitcnt vmcnt(10) ; GFX10-NEXT: v_cndmask_b32_e64 v13, v10, v11, s14 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v11, s13 +; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: v_cndmask_b32_e64 v11, v8, v9, s12 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s11 +; GFX10-NEXT: s_waitcnt vmcnt(6) ; GFX10-NEXT: v_cndmask_b32_e64 v9, v6, v7, s10 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v7, s9 +; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: v_cndmask_b32_e64 v7, v4, v5, s8 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; 
GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v5, s7 +; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_cndmask_b32_e64 v5, v2, v3, s6 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v3, s5 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, v1, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -49650,17 +49712,17 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: s_clause 0x1f ; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 -; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68 -; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72 -; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:76 -; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124 ; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:128 ; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:64 +; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124 ; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:60 ; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:120 ; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:56 ; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:116 ; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:52 +; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68 +; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72 +; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:76 ; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:112 ; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:48 ; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:108 @@ -49746,15 +49808,15 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 1, v13.h ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v31.l -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v36.l, v37.l, s26 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25) +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v38.l, s27 ; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v35.h, v38.h, s28 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26) ; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v39.l, v48.l, s29 ; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v39.h, v48.h, s25 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21) +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24) ; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v49.l, v50.l, s24 ; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v49.h, v50.h, s23 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19) @@ -50003,12 +50065,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250TRUE16-NEXT: s_clause 0x20 ; GFX1250TRUE16-NEXT: scratch_load_u16 v31, off, s32 -; GFX1250TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68 -; GFX1250TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72 -; GFX1250TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:76 -; GFX1250TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124 ; GFX1250TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:128 ; GFX1250TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:64 +; GFX1250TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124 ; 
GFX1250TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:60 ; GFX1250TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:120 ; GFX1250TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:56 @@ -50026,6 +50085,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX1250TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:32 ; GFX1250TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:92 ; GFX1250TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:28 +; GFX1250TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68 +; GFX1250TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72 +; GFX1250TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:76 ; GFX1250TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:88 ; GFX1250TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:24 ; GFX1250TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:84 @@ -50099,33 +50161,33 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX1250TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 1, v11.h ; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x20 ; GFX1250TRUE16-NEXT: v_and_b16 v0.h, 1, v31.l -; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x1a +; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x1e ; GFX1250TRUE16-NEXT: v_cndmask_b16 v15.l, v36.l, v37.l, s26 -; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x19 +; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x1c ; GFX1250TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v38.l, s27 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v14.h, v35.h, v38.h, s28 -; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x17 +; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x1a ; GFX1250TRUE16-NEXT: v_cndmask_b16 v13.l, v39.l, v48.l, s29 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v13.h, v39.h, v48.h, s25 -; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x15 +; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x18 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v12.l, v49.l, v50.l, s24 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v12.h, v49.h, v50.h, s23 -; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x13 +; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x16 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v11.l, v51.l, v52.l, s22 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v11.h, v51.h, v52.h, s21 -; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x11 +; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x14 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v10.l, v53.l, v54.l, s20 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v10.h, v53.h, v54.h, s19 -; GFX1250TRUE16-NEXT: s_wait_loadcnt 0xf +; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x12 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v9.l, v55.l, v64.l, s18 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v9.h, v55.h, v64.h, s17 -; GFX1250TRUE16-NEXT: s_wait_loadcnt 0xd +; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x10 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v8.l, v65.l, v66.l, s16 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v8.h, v65.h, v66.h, s15 -; GFX1250TRUE16-NEXT: s_wait_loadcnt 0xb +; GFX1250TRUE16-NEXT: s_wait_loadcnt 0xe ; GFX1250TRUE16-NEXT: v_cndmask_b16 v7.l, v67.l, v68.l, s14 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v7.h, v67.h, v68.h, s13 -; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x9 +; GFX1250TRUE16-NEXT: s_wait_loadcnt 0xc ; GFX1250TRUE16-NEXT: v_cndmask_b16 v6.l, v69.l, v70.l, s12 ; GFX1250TRUE16-NEXT: v_cndmask_b16 v6.h, v69.h, v70.h, s11 ; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x7 @@ -50172,15 +50234,15 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX1250FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:40 ; GFX1250FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:100 ; GFX1250FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:36 -; GFX1250FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:76 -; GFX1250FAKE16-NEXT: 
scratch_load_b32 v65, off, s32 offset:12 ; GFX1250FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:96 ; GFX1250FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:32 -; GFX1250FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:80 -; GFX1250FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:84 ; GFX1250FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:92 ; GFX1250FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:28 +; GFX1250FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:84 ; GFX1250FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:20 +; GFX1250FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:80 +; GFX1250FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:76 +; GFX1250FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:12 ; GFX1250FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:88 ; GFX1250FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:24 ; GFX1250FAKE16-NEXT: v_and_b32_e32 v30, 1, v30 @@ -50251,11 +50313,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 ; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v55, 16, v55 :: v_dual_lshrrev_b32 v54, 16, v54 -; GFX1250FAKE16-NEXT: s_wait_loadcnt 0xc +; GFX1250FAKE16-NEXT: s_wait_loadcnt 0xe ; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v14, v66, v67, vcc_lo ; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v67, 16, v67 :: v_dual_lshrrev_b32 v66, 16, v66 -; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x8 +; GFX1250FAKE16-NEXT: s_wait_loadcnt 0xc ; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v12, v70, v71, vcc_lo ; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 ; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v70, 16, v70 :: v_dual_bitop2_b32 v25, 1, v25 bitop3:0x40 @@ -54700,12 +54762,12 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:36 ; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:32 ; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:28 +; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:24 +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:20 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:16 ; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:4 ; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:8 ; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:12 -; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:16 -; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:20 -; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:24 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse @@ -54800,7 +54862,7 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX950-NEXT: v_fmac_f32_e32 v23, v6, v22 -; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(5) ; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v55 ; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v5 @@ -54809,6 +54871,7 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX950-NEXT: v_fmac_f32_e32 v22, v5, v21 +; GFX950-NEXT: s_waitcnt vmcnt(4) ; GFX950-NEXT: v_and_b32_e32 v5, 
0xffff0000, v37 ; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 @@ -54817,6 +54880,7 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX950-NEXT: v_fmac_f32_e32 v21, v4, v20 +; GFX950-NEXT: s_waitcnt vmcnt(3) ; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v34 ; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 @@ -54825,6 +54889,7 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX950-NEXT: v_fmac_f32_e32 v20, v3, v19 +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 ; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 @@ -55903,7 +55968,6 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x10 -; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64 ; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:12 @@ -55919,36 +55983,37 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:52 ; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:56 ; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:60 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64 ; GFX1250-NEXT: scratch_load_b32 v55, off, s32 -; GFX1250-NEXT: s_wait_loadcnt 0xf +; GFX1250-NEXT: s_wait_loadcnt 0x10 ; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v16, v32 -; GFX1250-NEXT: s_wait_loadcnt 0xe +; GFX1250-NEXT: s_wait_loadcnt 0xf ; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v17, v33 -; GFX1250-NEXT: s_wait_loadcnt 0xd +; GFX1250-NEXT: s_wait_loadcnt 0xe ; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v18, v34 -; GFX1250-NEXT: s_wait_loadcnt 0xc +; GFX1250-NEXT: s_wait_loadcnt 0xd ; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v19, v35 -; GFX1250-NEXT: s_wait_loadcnt 0xb +; GFX1250-NEXT: s_wait_loadcnt 0xc ; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v20, v36 -; GFX1250-NEXT: s_wait_loadcnt 0xa +; GFX1250-NEXT: s_wait_loadcnt 0xb ; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v21, v37 -; GFX1250-NEXT: s_wait_loadcnt 0x9 +; GFX1250-NEXT: s_wait_loadcnt 0xa ; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v22, v38 -; GFX1250-NEXT: s_wait_loadcnt 0x8 +; GFX1250-NEXT: s_wait_loadcnt 0x9 ; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v23, v39 -; GFX1250-NEXT: s_wait_loadcnt 0x7 +; GFX1250-NEXT: s_wait_loadcnt 0x8 ; GFX1250-NEXT: v_pk_fma_bf16 v8, v8, v24, v48 -; GFX1250-NEXT: s_wait_loadcnt 0x6 +; GFX1250-NEXT: s_wait_loadcnt 0x7 ; GFX1250-NEXT: v_pk_fma_bf16 v9, v9, v25, v49 -; GFX1250-NEXT: s_wait_loadcnt 0x5 +; GFX1250-NEXT: s_wait_loadcnt 0x6 ; GFX1250-NEXT: v_pk_fma_bf16 v10, v10, v26, v50 -; GFX1250-NEXT: s_wait_loadcnt 0x4 +; GFX1250-NEXT: s_wait_loadcnt 0x5 ; GFX1250-NEXT: v_pk_fma_bf16 v11, v11, v27, v51 -; GFX1250-NEXT: s_wait_loadcnt 0x3 +; GFX1250-NEXT: s_wait_loadcnt 0x4 ; GFX1250-NEXT: v_pk_fma_bf16 v12, v12, v28, v52 -; GFX1250-NEXT: s_wait_loadcnt 0x2 +; GFX1250-NEXT: s_wait_loadcnt 0x3 ; GFX1250-NEXT: v_pk_fma_bf16 v13, v13, v29, v53 -; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: s_wait_loadcnt 0x2 ; GFX1250-NEXT: v_pk_fma_bf16 
v14, v14, v30, v54 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_pk_fma_bf16 v15, v15, v55, v31 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 68313807c427f..6c25b36ed5e10 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -259,14 +259,14 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 ; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32 ; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16 ; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100 ; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x2000 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(2) ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse @@ -285,8 +285,9 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224 ; SDAG-GFX942-NEXT: s_nop 0 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: s_nop 0 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(14) ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(14) ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16 ; SDAG-GFX942-NEXT: s_nop 1 ; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse @@ -431,6 +432,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32 @@ -446,26 +448,40 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240 ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 
v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) @@ -785,14 +801,14 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 ; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32 ; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16 ; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100 ; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x100 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(2) ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse ; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse @@ -811,8 +827,9 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224 ; SDAG-GFX942-NEXT: s_nop 0 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: s_nop 0 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(14) ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(14) ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16 ; SDAG-GFX942-NEXT: s_nop 1 ; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse @@ -957,6 +974,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; 
GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32 @@ -972,26 +990,40 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240 ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 2b63a8cf69476..ec72ab930ddc0 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -726,9 +726,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; 
GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -738,9 +738,10 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] ; GCN-O0-NEXT: v_mov_b32_e32 v1, 1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_waitcnt vmcnt(1) ; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 offset:4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) ; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec ; GCN-O0-NEXT: v_writelane_b32 v6, s0, 4 @@ -772,9 +773,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: s_mov_b32 s2, s0 @@ -784,8 +785,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-O0-NEXT: v_mov_b32_e32 v1, 3 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_waitcnt vmcnt(1) ; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 offset:12 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) ; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec ; GCN-O0-NEXT: v_writelane_b32 v6, s0, 6 @@ -1312,15 +1314,12 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: v_readlane_b32 s5, v7, 19 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: ; %bb.11: ; %bb12 -; GCN-O0-NEXT: s_waitcnt expcnt(3) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt expcnt(2) -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: v_mov_b32_e32 v4, v3 ; GCN-O0-NEXT: ; implicit-def: 
$sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 07e6a76d14cf9..d38dbd9ba1f93 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -909,27 +909,31 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 -; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 ; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:5 ; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:7 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 ; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4 ; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v4, v4, v6 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1055,8 +1059,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 ; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 ; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 ; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:7 @@ -1064,9 +1068,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 137acd34ecc2a..5ef2a5b9df344 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -874,27 +874,31 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: s_waitcnt lgkmcnt(0) ; 
SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 -; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 ; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:5 ; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:7 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 ; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4 ; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v4, v4, v6 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1020,8 +1024,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 ; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 ; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 ; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:7 @@ -1029,9 +1033,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 14897b68bf57b..70fc83117f5a2 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1908,27 +1908,29 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v12, v[2:3] -; VI-NEXT: flat_load_ubyte v2, v[8:9] -; VI-NEXT: flat_load_ubyte v3, v[10:11] +; VI-NEXT: flat_load_ubyte v6, v[6:7] ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v5, v[0:1] -; VI-NEXT: flat_load_ubyte v6, v[6:7] +; VI-NEXT: flat_load_ubyte v12, v[2:3] +; VI-NEXT: 
flat_load_ubyte v3, v[10:11] +; VI-NEXT: flat_load_ubyte v2, v[8:9] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v7, v[0:1] ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v12 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16 @@ -1942,20 +1944,20 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 ; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 ; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 ; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 -; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 -; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 -; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v7 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 @@ -2003,20 +2005,20 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 ; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 ; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1 +; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6 ; GFX11-NEXT: global_load_d16_b16 v7, v0, s[2:3] offset:4 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 -; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 -; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 -; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v7 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index df7f8c6f39b3f..9f617a914e81a 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ 
-548,47 +548,52 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, 
v5 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -608,26 +613,29 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while @@ -638,35 +646,36 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: s_waitcnt vmcnt(22) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 @@ -676,6 +685,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4 ; 
GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3] +; GFX9-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 @@ -685,25 +695,28 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] ; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: s_waitcnt vmcnt(15) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-O0-NEXT: s_waitcnt vmcnt(14) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26 ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(12) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -712,12 +725,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc @@ -735,13 +750,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 ; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 ; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 ; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 ; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 @@ -757,12 +776,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: 
s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 ; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 @@ -850,29 +872,30 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(15) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4 +; GFX9-O0-NEXT: s_waitcnt 
vmcnt(11) ; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 ; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22 @@ -910,12 +933,16 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s7, s8 ; GFX9-O0-NEXT: s_mov_b32 s6, s9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7 ; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16 @@ -951,10 +978,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -962,39 +990,43 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte 
Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc @@ -1016,10 +1048,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 ; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 @@ -1098,28 +1132,35 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 ; GFX9-O0-NEXT: ; kill: def 
$vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_xor_b32_e64 v8, v5, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_xor_b32_e64 v0, v0, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 @@ -1661,18 +1702,18 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_9 ; GFX9-G-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 @@ -1687,8 +1728,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v5, v2, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v11 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 @@ -1722,26 +1765,29 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_readlane_b32 s4, v31, 4 ; GFX9-G-O0-NEXT: v_readlane_b32 s5, v31, 5 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_4 ; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-do-while @@ -1752,35 +1798,35 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_readlane_b32 s6, v31, 6 ; GFX9-G-O0-NEXT: v_readlane_b32 s7, v31, 7 -; GFX9-G-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v24, off, s[0:3], 
s32 offset:220 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(22) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 @@ -1797,6 +1843,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr4_vgpr5 killed $exec ; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v0, v1 ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr22_vgpr23 killed $exec +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v24 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v25 ; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr0 killed $exec @@ -1822,10 +1869,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v15, v2, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(14) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v27 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v28 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(12) ; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v29 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v30 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 @@ -1845,10 +1892,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v14 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v15 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v13, s[8:9], v13, v4 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v12, s[8:9], v12, v9, s[8:9] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v7, s[8:9] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v12, s[8:9], v6, v5, s[8:9] ; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 @@ -1867,9 +1917,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v22 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v10, v21 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v20 ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9] @@ -1879,9 +1933,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v19 ; GFX9-G-O0-NEXT: s_mov_b32 s8, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s12, -1 @@ -1961,32 +2019,34 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; 
GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v17 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(15) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v16 ; GFX9-G-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v4 ; GFX9-G-O0-NEXT: s_mov_b32 s5, 0xffffffc0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(12) ; GFX9-G-O0-NEXT: v_add_u32_e64 v4, v18, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 ; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v18 @@ -1995,7 +2055,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v18, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v6 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v18, v[20:21] ; GFX9-G-O0-NEXT: v_lshrrev_b64 v[25:26], v18, v[22:23] ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v5, v[20:21] @@ -2031,15 +2090,19 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_mov_b32 s7, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s6, -1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-G-O0-NEXT: v_add_co_u32_e64 v15, s[4:5], v15, v16 ; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5] ; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5] ; GFX9-G-O0-NEXT: buffer_store_dword v13, 
off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v12, s[4:5], v12, v13, s[4:5] ; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] @@ -2058,49 +2121,57 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_6 ; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, 
s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v2, v4 ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v7, s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v6, v7, s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v1, v6, s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v5 @@ -2116,8 +2187,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v8, s[6:7], v1, v2 ; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 ; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0xffffffc0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 @@ -2198,27 +2272,31 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-G-O0-NEXT: s_branch .LBB0_7 ; GFX9-G-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v9 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v10 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v11 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_xor_b32_e64 v0, v0, v7 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v1, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v8 ; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v2, v5 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v3, v4 ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v0, s[4:5], v0, v7 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v6, s[4:5] @@ -2671,47 +2749,52 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_9 ; GFX9-O0-NEXT: .LBB1_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -2731,26 +2814,29 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_4 ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while @@ -2761,35 +2847,36 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 
offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: s_waitcnt vmcnt(22) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; 
GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 @@ -2799,6 +2886,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4 ; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3] +; GFX9-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 @@ -2808,25 +2896,28 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] ; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: s_waitcnt vmcnt(15) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-O0-NEXT: s_waitcnt vmcnt(14) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26 ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(12) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -2835,12 +2926,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc @@ -2858,13 +2951,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 ; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 ; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 ; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 ; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 @@ -2880,12 +2977,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 
%rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 ; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 @@ -2973,29 +3073,30 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: s_waitcnt vmcnt(15) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 ; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22 @@ -3033,12 +3134,16 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s7, s8 ; GFX9-O0-NEXT: s_mov_b32 s6, s9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7 ; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16 @@ -3074,10 +3179,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -3085,39 +3191,43 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded 
Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc @@ -3139,10 +3249,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 ; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 @@ -3221,14 +3333,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_7 ; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 @@ -3691,18 +3804,18 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_9 ; GFX9-G-O0-NEXT: .LBB1_4: ; %udiv-loop-exit -; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; 
GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 @@ -3717,8 +3830,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v5, v2, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v11 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 @@ -3752,26 +3867,29 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_readlane_b32 s4, v32, 4 ; GFX9-G-O0-NEXT: v_readlane_b32 s5, v32, 5 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: 
buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_4 ; GFX9-G-O0-NEXT: .LBB1_6: ; %udiv-do-while @@ -3782,35 +3900,35 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_readlane_b32 s6, v32, 6 ; GFX9-G-O0-NEXT: v_readlane_b32 s7, v32, 7 -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload 
; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(22) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v4 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 @@ -3827,6 +3945,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr4_vgpr5 killed $exec ; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v0, v1 ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 ; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr0 killed $exec @@ -3852,10 +3971,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v13, v2, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(14) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v28 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v29 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(12) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v30 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 @@ -3875,10 +3994,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9] ; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 @@ -3897,8 +4019,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v11 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v24 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v25 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v26 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v27 ; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v11, v22 @@ -3917,9 +4041,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v19 ; GFX9-G-O0-NEXT: s_mov_b32 s8, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s12, -1 @@ -3999,32 +4127,33 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword 
v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(15) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-G-O0-NEXT: s_mov_b32 s5, 0xffffffc0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(12) ; GFX9-G-O0-NEXT: v_add_u32_e64 v4, v12, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 ; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v12 @@ -4065,9 +4194,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v17 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v18 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v19 ; GFX9-G-O0-NEXT: s_mov_b32 s4, -1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 @@ -4101,49 +4234,57 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_6 ; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-bb1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte 
Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v1, v4 ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v3, v5, s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v2, v3, s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v0, v2, s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v5 @@ -4158,6 +4299,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v3, s[6:7], v0, v1 ; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0xffffffc0 @@ -4172,6 +4314,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v3, v0 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v3, v[12:13] ; GFX9-G-O0-NEXT: v_lshrrev_b64 v[17:18], v8, v[12:13] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[15:16], v3, v[10:11] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v18 diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll index 5dcf5d437bae6..d85e31cfee807 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -237,21 +237,24 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 ; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:1 ; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:2 -; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 ; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:3 ; ALIGNED-SDAG-NEXT: ds_read_u8 
v8, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v5 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v6 offset:5 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v2 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v3 offset:1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v4 offset:2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v7 offset:3 @@ -449,30 +452,38 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 +; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 +; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 ; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 -; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 ; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 -; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 -; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 ; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v8 offset:7 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 @@ -596,17 +607,18 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:8 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt 
lgkmcnt(4) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) ; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:8 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) ; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) ; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) ; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:2 @@ -799,37 +811,42 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-SDAG-NEXT: ds_read_u8 v13, v0 offset:12 +; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 -; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 -; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 ; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 -; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 -; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 ; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 ; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 +; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 +; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 +; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 ; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 ; ALIGNED-SDAG-NEXT: ds_read_u8 v12, v0 offset:11 -; ALIGNED-SDAG-NEXT: ds_read_u8 v13, v0 offset:12 -; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13 ; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) @@ -982,22 +999,26 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:12 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:12 -; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:8 +; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 +; 
ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u16 v8, v0 offset:10 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:12 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v7 offset:8 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:2 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:6 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v8 offset:10 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 9f1b55ea3b1ef..260d98b6371c9 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -530,19 +530,20 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:1 -; CI-NEXT: ds_read_u8 v3, v1 offset:34 -; CI-NEXT: ds_read_u8 v4, v1 offset:32 -; CI-NEXT: ds_read_u8 v5, v1 offset:2 ; CI-NEXT: ds_read_u8 v6, v1 ; CI-NEXT: ds_read_u8 v7, v1 offset:3 +; CI-NEXT: ds_read_u8 v5, v1 offset:2 ; CI-NEXT: ds_read_u8 v8, v1 offset:33 +; CI-NEXT: ds_read_u8 v4, v1 offset:32 +; CI-NEXT: ds_read_u8 v3, v1 offset:34 ; CI-NEXT: ds_read_u8 v1, v1 offset:35 ; CI-NEXT: s_waitcnt lgkmcnt(7) ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; CI-NEXT: s_waitcnt lgkmcnt(3) +; CI-NEXT: s_waitcnt lgkmcnt(6) ; CI-NEXT: v_or_b32_e32 v2, v2, v6 -; CI-NEXT: s_waitcnt lgkmcnt(2) +; CI-NEXT: s_waitcnt lgkmcnt(5) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v7 +; CI-NEXT: s_waitcnt lgkmcnt(4) ; CI-NEXT: v_or_b32_e32 v5, v6, v5 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -623,19 +624,20 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:6 -; CI-NEXT: ds_read_u8 v3, v1 offset:11 -; CI-NEXT: ds_read_u8 v4, v1 offset:9 -; CI-NEXT: ds_read_u8 v5, v1 offset:7 ; CI-NEXT: ds_read_u8 v6, v1 offset:5 ; CI-NEXT: ds_read_u8 v7, v1 offset:8 +; CI-NEXT: ds_read_u8 v5, v1 offset:7 ; CI-NEXT: ds_read_u8 v8, v1 offset:10 +; CI-NEXT: ds_read_u8 v4, v1 offset:9 +; CI-NEXT: ds_read_u8 v3, v1 offset:11 ; CI-NEXT: ds_read_u8 v1, v1 offset:12 ; CI-NEXT: s_waitcnt lgkmcnt(7) ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; CI-NEXT: s_waitcnt lgkmcnt(3) +; CI-NEXT: s_waitcnt lgkmcnt(6) ; CI-NEXT: v_or_b32_e32 v2, v2, v6 -; CI-NEXT: s_waitcnt lgkmcnt(2) +; CI-NEXT: s_waitcnt lgkmcnt(5) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v7 +; CI-NEXT: s_waitcnt lgkmcnt(4) ; CI-NEXT: v_or_b32_e32 v5, v6, v5 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,8 +718,8 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 ; CI-NEXT: ds_read_u16 v2, v1 offset:2 -; CI-NEXT: ds_read_u16 v3, v1 offset:32 ; CI-NEXT: ds_read_u16 v4, v1 +; CI-NEXT: ds_read_u16 v3, v1 offset:32 ; CI-NEXT: ds_read_u16 v1, v1 offset:34 ; CI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1454,21 +1456,21 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_u8 v1, v0 offset:70 +; CI-NEXT: ds_read_u8 v4, v0 offset:69 ; CI-NEXT: ds_read_u8 v2, v0 offset:72 ; CI-NEXT: ds_read_u8 v3, v0 offset:71 -; CI-NEXT: ds_read_u8 v4, v0 offset:69 ; CI-NEXT: ds_read_u8 v5, v0 offset:68 ; CI-NEXT: s_waitcnt lgkmcnt(4) ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: s_waitcnt lgkmcnt(3) -; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; CI-NEXT: s_waitcnt lgkmcnt(2) -; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_or_b32_e32 v1, v1, v4 ; CI-NEXT: ds_read_u8 v4, v0 offset:66 ; CI-NEXT: ds_read_u8 v6, v0 offset:67 ; CI-NEXT: ds_read_u8 v0, v0 offset:65 +; CI-NEXT: s_waitcnt lgkmcnt(5) +; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; CI-NEXT: s_waitcnt lgkmcnt(4) +; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -1487,14 +1489,14 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-ALIGNED: ; %bb.0: ; %entry ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65 -; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66 -; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67 -; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68 ; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:70 ; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:69 ; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:72 ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71 +; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66 +; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65 +; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68 +; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 52bcaed7ec75a..f55de32728561 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -154,15 +154,15 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 { ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v6, vcc, 3, v0 ; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX7-ALIGNED-NEXT: flat_load_ubyte v4, v[4:5] -; GFX7-ALIGNED-NEXT: flat_load_ubyte v5, v[6:7] ; GFX7-ALIGNED-NEXT: flat_load_ubyte v2, v[2:3] +; GFX7-ALIGNED-NEXT: flat_load_ubyte v5, v[6:7] ; GFX7-ALIGNED-NEXT: flat_load_ubyte v0, v[0:1] ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v5 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll index 6f8da57e223e5..44493498bdbe9 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -205,15 +205,15 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 { ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0 ; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index 308e86bbaf8fd..1efa0eeb94b87 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -2890,19 +2890,21 @@ define void @freeze_v19i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX6-GISEL-NEXT: buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64 -; GFX6-GISEL-NEXT: buffer_load_dword v22, v[0:1], s[4:7], 0 addr64 offset:72 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX6-GISEL-NEXT: buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64 +; GFX6-GISEL-NEXT: buffer_load_dword v22, v[0:1], s[4:7], 0 addr64 offset:72 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48 +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX6-GISEL-NEXT: buffer_store_dwordx2 v[20:21], v[2:3], s[4:7], 0 addr64 offset:64 +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX6-GISEL-NEXT: buffer_store_dword v22, v[2:3], s[4:7], 0 addr64 offset:72 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -4316,28 +4318,30 @@ define void @freeze_v31i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX6-GISEL-NEXT: buffer_load_dword v34, v[0:1], s[4:7], 0 addr64 offset:120 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 
offset:48 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80 -; GFX6-GISEL-NEXT: buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112 ; GFX6-GISEL-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96 +; GFX6-GISEL-NEXT: buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112 +; GFX6-GISEL-NEXT: buffer_load_dword v34, v[0:1], s[4:7], 0 addr64 offset:120 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(7) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(7) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(7) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(7) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(6) +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96 +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112 +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(8) ; GFX6-GISEL-NEXT: buffer_store_dword v34, v[2:3], s[4:7], 0 addr64 offset:120 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -10380,18 +10384,18 @@ define void @freeze_v8p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) { ; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 16, v0 ; GFX6-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX6-SDAG-NEXT: ds_read_b64 v[2:3], v2 -; GFX6-SDAG-NEXT: ds_read_b64 v[4:5], v4 ; GFX6-SDAG-NEXT: ds_read_b64 v[6:7], v0 +; GFX6-SDAG-NEXT: ds_read_b64 v[4:5], v4 +; GFX6-SDAG-NEXT: ds_read_b64 v[2:3], v2 ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v0 ; GFX6-SDAG-NEXT: ds_read_b64 v[8:9], v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 16, v1 ; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(2) ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[4:5] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 24, v1 +; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(2) ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[2:3] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v1 -; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; GFX6-SDAG-NEXT: ds_write_b64 v1, v[6:7] ; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[8:9] @@ -10538,31 +10542,35 @@ define void @freeze_v16p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) { ; GFX6-SDAG-NEXT: v_add_i32_e32 v16, vcc, 56, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 48, v0 ; GFX6-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX6-SDAG-NEXT: ds_read_b64 v[2:3], v2 -; GFX6-SDAG-NEXT: ds_read_b64 v[4:5], v4 -; GFX6-SDAG-NEXT: ds_read_b64 v[6:7], v6 ; GFX6-SDAG-NEXT: ds_read_b64 v[8:9], v0 -; GFX6-SDAG-NEXT: ds_read_b64 v[10:11], v10 -; GFX6-SDAG-NEXT: ds_read_b64 v[12:13], v12 ; GFX6-SDAG-NEXT: ds_read_b64 v[14:15], v14 ; GFX6-SDAG-NEXT: ds_read_b64 v[16:17], v16 +; GFX6-SDAG-NEXT: ds_read_b64 v[12:13], v12 +; GFX6-SDAG-NEXT: ds_read_b64 
v[10:11], v10 +; GFX6-SDAG-NEXT: ds_read_b64 v[6:7], v6 +; GFX6-SDAG-NEXT: ds_read_b64 v[4:5], v4 +; GFX6-SDAG-NEXT: ds_read_b64 v[2:3], v2 ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 48, v1 -; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(4) -; GFX6-SDAG-NEXT: ds_write_b64 v1, v[8:9] -; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(2) +; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(6) ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[14:15] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 56, v1 -; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(2) +; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(6) ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[16:17] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 32, v1 +; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(6) ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[12:13] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 40, v1 +; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(6) ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[10:11] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 16, v1 +; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(6) ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[6:7] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 24, v1 +; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(6) ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[4:5] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v1 +; GFX6-SDAG-NEXT: ds_write_b64 v1, v[8:9] +; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[2:3] ; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -10934,14 +10942,15 @@ define void @freeze_v3p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v0 ; GFX6-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 4, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 8, v1 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(2) ; GFX6-SDAG-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(2) ; GFX6-SDAG-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(2) ; GFX6-SDAG-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen @@ -10971,14 +10980,15 @@ define void @freeze_v3p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v0 ; GFX7-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 4, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 8, v1 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(2) ; GFX7-SDAG-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(2) ; GFX7-SDAG-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(2) ; GFX7-SDAG-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen @@ -11072,17 +11082,19 @@ define void @freeze_v4p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; 
GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 12, v0 ; GFX6-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 4, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 8, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 12, v1 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(3) ; GFX6-SDAG-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(3) ; GFX6-SDAG-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(3) ; GFX6-SDAG-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(3) ; GFX6-SDAG-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen @@ -11118,17 +11130,19 @@ define void @freeze_v4p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, 12, v0 ; GFX7-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 4, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 8, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 12, v1 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(3) ; GFX7-SDAG-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(3) ; GFX7-SDAG-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(3) ; GFX7-SDAG-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(3) ; GFX7-SDAG-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen @@ -11242,13 +11256,13 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 12, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 8, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 4, v0 -; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 28, v0 ; GFX6-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v9, vcc, 4, v1 @@ -11258,13 +11272,19 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-SDAG-NEXT: v_add_i32_e32 v13, vcc, 20, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v1 ; 
GFX6-SDAG-NEXT: v_add_i32_e32 v15, vcc, 28, v1 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dword v3, v13, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dword v2, v14, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX6-SDAG-NEXT: buffer_store_dword v0, v15, s[0:3], 0 offen @@ -11324,13 +11344,13 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 12, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 8, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 4, v0 -; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, 28, v0 ; GFX7-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, 4, v1 @@ -11340,13 +11360,19 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, 20, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v15, vcc, 28, v1 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: buffer_store_dword v3, v13, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: buffer_store_dword v2, v14, s[0:3], 0 offen ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) ; GFX7-SDAG-NEXT: buffer_store_dword v0, v15, s[0:3], 0 offen @@ -11538,10 +11564,10 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 12, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 8, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 4, v0 -; GFX6-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen -; GFX6-SDAG-NEXT: 
buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 48, v0 @@ -11552,53 +11578,54 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-SDAG-NEXT: v_add_i32_e32 v13, vcc, 28, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; GFX6-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v17, vcc, 4, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v18, vcc, 8, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v19, vcc, 12, v1 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) -; GFX6-SDAG-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) -; GFX6-SDAG-NEXT: buffer_store_dword v7, v18, s[0:3], 0 offen -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX6-SDAG-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt expcnt(0) ; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 16, v1 +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(13) ; GFX6-SDAG-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt expcnt(0) ; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 40, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v17, vcc, 20, v1 +; GFX6-SDAG-NEXT: buffer_store_dword v7, v18, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt expcnt(0) ; GFX6-SDAG-NEXT: v_add_i32_e32 v7, vcc, 24, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v18, vcc, 28, v1 +; GFX6-SDAG-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt expcnt(0) ; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v19, vcc, 36, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 44, v1 -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX6-SDAG-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) ; GFX6-SDAG-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX6-SDAG-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) ; GFX6-SDAG-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen -; GFX6-SDAG-NEXT: 
s_waitcnt vmcnt(9) +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX6-SDAG-NEXT: buffer_store_dword v12, v6, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX6-SDAG-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v5, vcc, 48, v1 +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX6-SDAG-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt expcnt(0) ; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 52, v1 @@ -11606,8 +11633,8 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-SDAG-NEXT: s_waitcnt expcnt(0) ; GFX6-SDAG-NEXT: v_add_i32_e32 v3, vcc, 56, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v1, vcc, 60, v1 -; GFX6-SDAG-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(14) +; GFX6-SDAG-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -11629,8 +11656,8 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-GISEL-NEXT: v_add_i32_e32 v12, vcc, 40, v0 ; GFX6-GISEL-NEXT: v_add_i32_e32 v13, vcc, 44, v0 ; GFX6-GISEL-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen @@ -11660,35 +11687,34 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-GISEL-NEXT: v_add_i32_e32 v18, vcc, 28, v1 ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX6-GISEL-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen -; GFX6-GISEL-NEXT: s_waitcnt expcnt(0) -; GFX6-GISEL-NEXT: v_add_i32_e32 v4, vcc, 32, v1 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX6-GISEL-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 40, v1 +; GFX6-GISEL-NEXT: s_waitcnt expcnt(1) +; GFX6-GISEL-NEXT: v_add_i32_e32 v4, vcc, 32, v1 ; GFX6-GISEL-NEXT: v_add_i32_e32 v19, vcc, 36, v1 ; GFX6-GISEL-NEXT: s_waitcnt expcnt(0) ; GFX6-GISEL-NEXT: v_add_i32_e32 v5, vcc, 44, v1 -; GFX6-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX6-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_store_dword v6, v17, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX6-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_store_dword v8, v18, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX6-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX6-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen ; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 48, v1 +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX6-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen ; 
GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 52, v1 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX6-GISEL-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 56, v1 ; GFX6-GISEL-NEXT: v_add_i32_e32 v1, vcc, 60, v1 -; GFX6-GISEL-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX6-GISEL-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -11700,10 +11726,10 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 12, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 8, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 4, v0 -; GFX7-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 48, v0 @@ -11714,58 +11740,57 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, 28, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; GFX7-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v17, vcc, 4, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v18, vcc, 8, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v19, vcc, 12, v1 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) -; GFX7-SDAG-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) -; GFX7-SDAG-NEXT: buffer_store_dword v7, v18, s[0:3], 0 offen -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX7-SDAG-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 16, v1 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(13) ; GFX7-SDAG-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 40, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v17, vcc, 20, v1 +; GFX7-SDAG-NEXT: buffer_store_dword v7, v18, s[0:3], 0 offen ; 
GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, 24, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v18, vcc, 28, v1 +; GFX7-SDAG-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v19, vcc, 36, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 44, v1 -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX7-SDAG-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) ; GFX7-SDAG-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX7-SDAG-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) ; GFX7-SDAG-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX7-SDAG-NEXT: buffer_store_dword v12, v6, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX7-SDAG-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, 48, v1 +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) ; GFX7-SDAG-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, 52, v1 ; GFX7-SDAG-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, 56, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, 60, v1 -; GFX7-SDAG-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(14) +; GFX7-SDAG-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -11787,8 +11812,8 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 40, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v13, vcc, 44, v0 ; GFX7-GISEL-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen @@ -11816,33 +11841,32 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-GISEL-NEXT: v_add_i32_e32 v18, vcc, 28, v1 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX7-GISEL-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v4, vcc, 32, v1 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX7-GISEL-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 40, v1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v4, vcc, 32, v1 ; GFX7-GISEL-NEXT: v_add_i32_e32 v19, vcc, 36, v1 ; GFX7-GISEL-NEXT: v_add_i32_e32 v5, vcc, 44, v1 -; GFX7-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX7-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_store_dword v6, v17, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX7-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_store_dword v8, v18, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen ; 
GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX7-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX7-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 48, v1 +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX7-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 52, v1 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX7-GISEL-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 56, v1 ; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, 60, v1 -; GFX7-GISEL-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX7-GISEL-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -11864,8 +11888,8 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 40, v0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v13, vcc, 44, v0 ; GFX8-GISEL-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen @@ -11893,33 +11917,32 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX8-GISEL-NEXT: v_add_u32_e32 v18, vcc, 28, v1 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX8-GISEL-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v1 -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX8-GISEL-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 40, v1 +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v1 ; GFX8-GISEL-NEXT: v_add_u32_e32 v19, vcc, 36, v1 ; GFX8-GISEL-NEXT: v_add_u32_e32 v5, vcc, 44, v1 -; GFX8-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX8-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_store_dword v6, v17, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX8-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_store_dword v8, v18, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX8-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX8-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 48, v1 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX8-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 52, v1 -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX8-GISEL-NEXT: buffer_store_dword v15, v2, s[0:3], 0 
offen ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 56, v1 ; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, 60, v1 -; GFX8-GISEL-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX8-GISEL-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 5babe9fb3d851..d86894fb335c7 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -2631,13 +2631,13 @@ define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) % ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, off, s32 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2835,9 +2835,9 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 ; CIGFX89: ; %bb.0: ; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 ; CIGFX89-NEXT: s_mov_b32 s6, -1 ; CIGFX89-NEXT: s_waitcnt vmcnt(3) @@ -2868,8 +2868,8 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:12 ; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:12 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:8 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -2890,7 +2890,7 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3241,11 +3241,11 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; GFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; GFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 ; GFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 
+; GFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_waitcnt vmcnt(5) @@ -3280,10 +3280,10 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20 ; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:16 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -3304,13 +3304,13 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3330,13 +3330,13 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 ; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 ; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt vmcnt(7) @@ -3367,13 +3367,13 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(7) @@ -3404,13 +3404,13 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> 
%arg0, <2 x i64> %arg1, <2 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(7) @@ -3443,13 +3443,13 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x8 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24 ; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16 ; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12 ; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -3470,7 +3470,7 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3573,13 +3573,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3592,13 +3592,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 
-; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 ; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 ; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3622,13 +3622,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3641,13 +3641,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3671,13 +3671,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3690,13 +3690,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 @@ -3719,26 +3719,26 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x10 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:60 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:32 ; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:28 ; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:24 ; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:56 ; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:40 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:36 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: s_waitcnt vmcnt(16) ; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc @@ -3755,13 +3755,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_waitcnt vmcnt(12) ; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc ; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3812,25 +3812,25 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 -; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 +; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3885,25 +3885,25 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_dword v21, off, s[0:3], 
s32 offset:88 ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3959,26 +3959,26 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 @@ -4001,42 +4001,42 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:28 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; 
GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:40 ; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:64 ; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:60 ; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:56 ; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:40 ; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:24 ; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:104 ; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:88 ; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:72 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:68 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: s_waitcnt vmcnt(32) ; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc @@ -4053,25 +4053,25 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: s_waitcnt vmcnt(28) ; GFX11-NEXT: buffer_store_b128 v[84:87], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: s_waitcnt vmcnt(24) ; GFX11-NEXT: buffer_store_b128 v[80:83], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: s_waitcnt vmcnt(20) ; GFX11-NEXT: buffer_store_b128 v[68:71], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_waitcnt vmcnt(16) ; GFX11-NEXT: buffer_store_b128 v[64:67], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_waitcnt vmcnt(12) ; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc ; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4280,13 +4280,13 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 ; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4353,13 +4353,13 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ubyte v38, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4426,13 +4426,13 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ubyte v38, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index b750d28ffa7d3..1054c7b792eb8 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ 
b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2395,27 +2395,27 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:88 ; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:120 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:116 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:112 ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:108 ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 -; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:136 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:144 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:148 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:156 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:148 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:144 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 +; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:136 +; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 +; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:100 ; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:96 @@ -2426,14 +2426,14 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:76 ; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:72 ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 ; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 
offset:120 +; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 ; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:68 ; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:64 ; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:60 @@ -2443,14 +2443,14 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 ; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 ; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 @@ -2533,24 +2533,24 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168 ; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164 ; GFX11-NEXT: s_clause 0x11 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60 
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8 ; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112 @@ -2735,6 +2735,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v31, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:516 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 @@ -2775,8 +2776,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:788 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:516 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(40) ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3009,6 +3009,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_writelane_b32 v63, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_clause 0x28 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 @@ -3049,8 +3050,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:788 ; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(40) ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3262,6 +3262,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_mov_b32_e32 v32, v48 ; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:512 ; GFX11-NEXT: scratch_load_b128 v[48:51], off, s33 offset:656 ; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:672 ; GFX11-NEXT: scratch_load_b128 v[37:40], off, s33 offset:688 @@ -3271,22 +3272,22 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:752 ; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:768 ; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:784 -; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:512 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4 -; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(9) ; GFX11-NEXT: scratch_store_b128 off, v[16:19], s33 offset:1584 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 ; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:528 ; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 ; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:560 -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v7, v10 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_mov_b32_e32 v10, v21 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:592 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_mov_b32_e32 v10, v21 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index f80d50b56f550..b183deff48fdb 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -648,15 +648,15 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] @@ -670,14 +670,14 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 -; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0100 +; GFX10-DL-NEXT: s_waitcnt vmcnt(1) +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index f5d7bb3a45fe1..cf1997204704b 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1603,7 +1603,7 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, 
v31, 6 ; NOOPT-NEXT: v_readlane_b32 s1, v31, 7 -; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload @@ -1620,12 +1620,13 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v16 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; NOOPT-NEXT: s_waitcnt vmcnt(1) ; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill @@ -4119,6 +4120,7 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v31, 6 ; NOOPT-NEXT: v_readlane_b32 s1, v31, 7 +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload @@ -4143,12 +4145,12 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_movreld_b32_e32 v0, v16 ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill @@ -4208,41 +4210,52 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: v_readlane_b32 s1, v31, 1 ; NOOPT-NEXT: v_readlane_b32 s2, v31, 2 ; NOOPT-NEXT: v_readlane_b32 s3, v31, 3 -; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload -; 
NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v30, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(12) +; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_mov_b32_e32 v4, v18 ; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: s_waitcnt vmcnt(13) ; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt vmcnt(12) ; NOOPT-NEXT: v_mov_b32_e32 v0, v15 -; NOOPT-NEXT: s_waitcnt vmcnt(8) +; NOOPT-NEXT: s_waitcnt vmcnt(11) ; NOOPT-NEXT: v_mov_b32_e32 v1, v22 +; NOOPT-NEXT: s_waitcnt vmcnt(10) ; NOOPT-NEXT: v_mov_b32_e32 v2, v21 +; NOOPT-NEXT: s_waitcnt vmcnt(9) ; NOOPT-NEXT: v_mov_b32_e32 v3, v20 +; NOOPT-NEXT: s_waitcnt vmcnt(8) ; NOOPT-NEXT: v_mov_b32_e32 v7, v19 -; NOOPT-NEXT: s_waitcnt vmcnt(4) +; NOOPT-NEXT: s_waitcnt vmcnt(7) ; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: s_waitcnt vmcnt(6) ; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: s_waitcnt vmcnt(5) ; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: s_waitcnt vmcnt(4) ; NOOPT-NEXT: v_mov_b32_e32 v8, v23 -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_waitcnt vmcnt(3) ; NOOPT-NEXT: v_mov_b32_e32 v9, v30 +; NOOPT-NEXT: s_waitcnt vmcnt(2) ; NOOPT-NEXT: v_mov_b32_e32 v10, v29 +; NOOPT-NEXT: s_waitcnt vmcnt(1) ; NOOPT-NEXT: v_mov_b32_e32 v11, v28 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v15, v27 ; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v16, v11 @@ -4592,6 +4605,7 @@ define amdgpu_kernel void 
@insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v31, 6 ; NOOPT-NEXT: v_readlane_b32 s1, v31, 7 +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload @@ -4616,12 +4630,12 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_add_i32 m0, s2, -16 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_movreld_b32_e32 v0, v16 ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill @@ -4681,41 +4695,52 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: v_readlane_b32 s1, v31, 1 ; NOOPT-NEXT: v_readlane_b32 s2, v31, 2 ; NOOPT-NEXT: v_readlane_b32 s3, v31, 3 -; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 
offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v30, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(12) +; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_mov_b32_e32 v4, v18 ; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: s_waitcnt vmcnt(13) ; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt vmcnt(12) ; NOOPT-NEXT: v_mov_b32_e32 v0, v15 -; NOOPT-NEXT: s_waitcnt vmcnt(8) +; NOOPT-NEXT: s_waitcnt vmcnt(11) ; NOOPT-NEXT: v_mov_b32_e32 v1, v22 +; NOOPT-NEXT: s_waitcnt vmcnt(10) ; NOOPT-NEXT: v_mov_b32_e32 v2, v21 +; NOOPT-NEXT: s_waitcnt vmcnt(9) ; NOOPT-NEXT: v_mov_b32_e32 v3, v20 +; NOOPT-NEXT: s_waitcnt vmcnt(8) ; NOOPT-NEXT: v_mov_b32_e32 v7, v19 -; NOOPT-NEXT: s_waitcnt vmcnt(4) +; NOOPT-NEXT: s_waitcnt vmcnt(7) ; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: s_waitcnt vmcnt(6) ; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: s_waitcnt vmcnt(5) ; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: s_waitcnt vmcnt(4) ; NOOPT-NEXT: v_mov_b32_e32 v8, v23 -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_waitcnt vmcnt(3) ; NOOPT-NEXT: v_mov_b32_e32 v9, v30 +; NOOPT-NEXT: s_waitcnt vmcnt(2) ; NOOPT-NEXT: v_mov_b32_e32 v10, v29 +; NOOPT-NEXT: s_waitcnt vmcnt(1) ; NOOPT-NEXT: v_mov_b32_e32 v11, v28 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v15, v27 ; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v16, v11 @@ -5132,7 +5157,7 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v18, 23 ; NOOPT-NEXT: v_readlane_b32 s1, v18, 24 -; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:12 ; 4-byte Folded Reload @@ -5149,12 +5174,13 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:56 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:64 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v16 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: s_waitcnt vmcnt(1) ; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 ; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:84 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword 
v0, off, s[36:39], 0 offset:80 ; 4-byte Folded Spill @@ -5249,7 +5275,7 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v18, 28 ; NOOPT-NEXT: v_readlane_b32 s1, v18, 29 -; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:88 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:92 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:96 ; 4-byte Folded Reload @@ -5266,12 +5292,13 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:140 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:144 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:148 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v16 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: s_waitcnt vmcnt(1) ; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 ; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:156 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:152 ; 4-byte Folded Spill @@ -5302,10 +5329,10 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: v_readlane_b32 s5, v18, 1 ; NOOPT-NEXT: v_readlane_b32 s6, v18, 2 ; NOOPT-NEXT: v_readlane_b32 s7, v18, 3 -; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(2) ; NOOPT-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: buffer_store_dword v1, off, s[4:7], 0 @@ -5860,6 +5887,7 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v32, 7 ; NOOPT-NEXT: v_readlane_b32 s1, v32, 8 +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Reload @@ -5884,12 +5912,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:80 ; 4-byte 
Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_movreld_b32_e32 v0, v16 ; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:88 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:92 ; 4-byte Folded Spill @@ -5998,6 +6026,7 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v32, 11 ; NOOPT-NEXT: v_readlane_b32 s1, v32, 12 +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload @@ -6022,12 +6051,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_movreld_b32_e32 v0, v16 ; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:220 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:224 ; 4-byte Folded Spill @@ -6088,42 +6117,52 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: v_readlane_b32 s5, v32, 1 ; NOOPT-NEXT: v_readlane_b32 s6, v32, 2 ; NOOPT-NEXT: v_readlane_b32 s7, v32, 3 -; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:220 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:228 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v19, off, s[28:31], 0 offset:232 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v20, off, s[28:31], 0 offset:236 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v21, off, s[28:31], 0 offset:240 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v22, off, s[28:31], 0 offset:244 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:228 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:220 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v23, off, s[28:31], 0 offset:248 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v24, off, s[28:31], 0 offset:252 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v25, off, s[28:31], 0 offset:256 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v26, off, s[28:31], 0 offset:260 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, 
s[28:31], 0 offset:244 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[28:31], 0 offset:240 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[28:31], 0 offset:236 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v27, off, s[28:31], 0 offset:264 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v28, off, s[28:31], 0 offset:268 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v29, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v30, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[28:31], 0 offset:260 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[28:31], 0 offset:256 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[28:31], 0 offset:252 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v31, off, s[28:31], 0 offset:280 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(12) +; NOOPT-NEXT: buffer_load_dword v30, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[28:31], 0 offset:268 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_mov_b32_e32 v5, v19 ; NOOPT-NEXT: v_mov_b32_e32 v6, v18 ; NOOPT-NEXT: v_mov_b32_e32 v7, v17 +; NOOPT-NEXT: s_waitcnt vmcnt(13) ; NOOPT-NEXT: v_mov_b32_e32 v1, v16 -; NOOPT-NEXT: s_waitcnt vmcnt(8) +; NOOPT-NEXT: s_waitcnt vmcnt(12) ; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: s_waitcnt vmcnt(11) ; NOOPT-NEXT: v_mov_b32_e32 v3, v22 +; NOOPT-NEXT: s_waitcnt vmcnt(10) ; NOOPT-NEXT: v_mov_b32_e32 v4, v21 +; NOOPT-NEXT: s_waitcnt vmcnt(9) ; NOOPT-NEXT: v_mov_b32_e32 v8, v20 -; NOOPT-NEXT: s_waitcnt vmcnt(4) +; NOOPT-NEXT: s_waitcnt vmcnt(8) ; NOOPT-NEXT: v_mov_b32_e32 v13, v27 +; NOOPT-NEXT: s_waitcnt vmcnt(7) ; NOOPT-NEXT: v_mov_b32_e32 v14, v26 +; NOOPT-NEXT: s_waitcnt vmcnt(6) ; NOOPT-NEXT: v_mov_b32_e32 v15, v25 +; NOOPT-NEXT: s_waitcnt vmcnt(5) ; NOOPT-NEXT: v_mov_b32_e32 v9, v24 -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_waitcnt vmcnt(4) ; NOOPT-NEXT: v_mov_b32_e32 v10, v31 +; NOOPT-NEXT: s_waitcnt vmcnt(3) ; NOOPT-NEXT: v_mov_b32_e32 v11, v30 +; NOOPT-NEXT: s_waitcnt vmcnt(2) ; NOOPT-NEXT: v_mov_b32_e32 v12, v29 +; NOOPT-NEXT: s_waitcnt vmcnt(1) ; NOOPT-NEXT: v_mov_b32_e32 v16, v28 ; NOOPT-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v17, v12 @@ -9094,6 +9133,7 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v18, 6 ; NOOPT-NEXT: v_readlane_b32 s1, v18, 7 +; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload @@ -9118,12 +9158,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload -; 
NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_movreld_b32_e32 v0, v16 ; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:84 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:88 ; 4-byte Folded Spill @@ -9580,6 +9620,7 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v33, 9 ; NOOPT-NEXT: v_readlane_b32 s1, v33, 10 +; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload @@ -9604,12 +9645,12 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 ; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 ; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_movreld_b32_e32 v1, v16 ; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill @@ -9669,48 +9710,58 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: v_readlane_b32 s1, v33, 4 ; NOOPT-NEXT: v_readlane_b32 s2, v33, 5 ; NOOPT-NEXT: v_readlane_b32 s3, v33, 6 -; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v19, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v20, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v21, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v22, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v23, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v24, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v25, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v26, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v27, off, s[16:19], 0 offset:188 ; 
4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v28, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v29, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v30, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v31, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v32, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt vmcnt(12) +; NOOPT-NEXT: buffer_load_dword v31, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: v_mov_b32_e32 v6, v20 ; NOOPT-NEXT: v_mov_b32_e32 v7, v19 ; NOOPT-NEXT: v_mov_b32_e32 v8, v18 ; NOOPT-NEXT: v_mov_b32_e32 v0, v17 -; NOOPT-NEXT: s_waitcnt vmcnt(8) +; NOOPT-NEXT: s_waitcnt vmcnt(13) ; NOOPT-NEXT: v_mov_b32_e32 v1, v24 +; NOOPT-NEXT: s_waitcnt vmcnt(12) ; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: s_waitcnt vmcnt(11) ; NOOPT-NEXT: v_mov_b32_e32 v3, v22 +; NOOPT-NEXT: s_waitcnt vmcnt(10) ; NOOPT-NEXT: v_mov_b32_e32 v9, v21 -; NOOPT-NEXT: s_waitcnt vmcnt(4) +; NOOPT-NEXT: s_waitcnt vmcnt(9) ; NOOPT-NEXT: v_mov_b32_e32 v14, v28 +; NOOPT-NEXT: s_waitcnt vmcnt(8) ; NOOPT-NEXT: v_mov_b32_e32 v15, v27 +; NOOPT-NEXT: s_waitcnt vmcnt(7) ; NOOPT-NEXT: v_mov_b32_e32 v16, v26 +; NOOPT-NEXT: s_waitcnt vmcnt(6) ; NOOPT-NEXT: v_mov_b32_e32 v10, v25 -; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_waitcnt vmcnt(5) ; NOOPT-NEXT: v_mov_b32_e32 v11, v32 +; NOOPT-NEXT: s_waitcnt vmcnt(4) ; NOOPT-NEXT: v_mov_b32_e32 v12, v31 +; NOOPT-NEXT: s_waitcnt vmcnt(3) ; NOOPT-NEXT: v_mov_b32_e32 v13, v30 +; NOOPT-NEXT: s_waitcnt vmcnt(2) ; NOOPT-NEXT: v_mov_b32_e32 v17, v29 ; NOOPT-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18_vgpr19_vgpr20 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v18, v13 ; NOOPT-NEXT: v_mov_b32_e32 v19, v12 ; NOOPT-NEXT: v_mov_b32_e32 v20, v11 +; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_mov_b32_e32 v12, v5 ; NOOPT-NEXT: v_mov_b32_e32 v11, v4 ; NOOPT-NEXT: buffer_store_dwordx4 v[17:20], v[11:12], s[0:3], 0 addr64 offset:48 diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll index 0d3f342f7735e..a7fb563e6698c 100644 --- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll +++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck 
-check-prefix=SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s ; Check for verifier error due to trying to save and restore SCC ; around a waterfall looop when it was never defined. We have to get @@ -59,14 +59,15 @@ define void @issue92561(ptr addrspace(1) %arg) { ; SDAG-NEXT: s_mov_b32 s7, s12 ; SDAG-NEXT: s_clause 0x2 ; SDAG-NEXT: image_sample_c_lz v0, [v1, v1, v0, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY -; SDAG-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; SDAG-NEXT: image_sample_c_lz v2, [v1, v2, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; SDAG-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; SDAG-NEXT: s_waitcnt vmcnt(2) ; SDAG-NEXT: v_add_f32_e32 v0, v9, v0 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 +; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_add_f32_e32 v0, v3, v0 ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-NEXT: v_mul_f32_e32 v0, 0x3e800000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index a2da8876472ab..c1078e280621c 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -5199,10 +5199,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_load_dword s2, s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa -; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49 ; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50 -; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51 ; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52 +; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49 +; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5684,17 +5684,17 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s4, 42 ; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: flat_load_ushort v4, v[4:5] ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: flat_store_byte v[0:1], v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_short v[0:1], v4 diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 69a871f6f6ae5..a33fd0eae8726 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ 
b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -341,14 +341,15 @@ define amdgpu_kernel void @test_flat_misaligned_v4(ptr %arg) { ; SPLIT-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 ; SPLIT-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; SPLIT-NEXT: s_clause 0x3 -; SPLIT-NEXT: flat_load_dword v8, v[2:3] ; SPLIT-NEXT: flat_load_dword v9, v[4:5] ; SPLIT-NEXT: flat_load_dword v10, v[0:1] +; SPLIT-NEXT: flat_load_dword v8, v[2:3] ; SPLIT-NEXT: flat_load_dword v11, v[6:7] -; SPLIT-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; SPLIT-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; SPLIT-NEXT: flat_store_dword v[6:7], v9 -; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; SPLIT-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) ; SPLIT-NEXT: flat_store_dword v[2:3], v10 +; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) ; SPLIT-NEXT: flat_store_dword v[0:1], v8 ; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) ; SPLIT-NEXT: flat_store_dword v[4:5], v11 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index efb55db486489..eea59d943279b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -807,22 +807,22 @@ define amdgpu_kernel void @fmuladd_v2f16( ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 ; VI-FLUSH-NEXT: s_mov_b32 s15, s11 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s12, s2 -; VI-FLUSH-NEXT: s_mov_b32 s13, s3 ; VI-FLUSH-NEXT: s_mov_b32 s16, s4 ; VI-FLUSH-NEXT: s_mov_b32 s17, s5 -; VI-FLUSH-NEXT: s_mov_b32 s18, s10 -; VI-FLUSH-NEXT: s_mov_b32 s19, s11 ; VI-FLUSH-NEXT: s_mov_b32 s4, s6 ; VI-FLUSH-NEXT: s_mov_b32 s5, s7 ; VI-FLUSH-NEXT: s_mov_b32 s6, s10 ; VI-FLUSH-NEXT: s_mov_b32 s7, s11 -; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-FLUSH-NEXT: s_mov_b32 s12, s2 +; VI-FLUSH-NEXT: s_mov_b32 s13, s3 +; VI-FLUSH-NEXT: s_mov_b32 s18, s10 +; VI-FLUSH-NEXT: s_mov_b32 s19, s11 ; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[16:19], 0 ; VI-FLUSH-NEXT: s_mov_b32 s8, s0 ; VI-FLUSH-NEXT: s_mov_b32 s9, s1 -; VI-FLUSH-NEXT: s_waitcnt vmcnt(1) +; VI-FLUSH-NEXT: s_waitcnt vmcnt(2) ; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index f971080e02c5b..515b9a425b2d7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -2399,9 +2399,9 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:84 ; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:96 ; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:92 -; GFX950-NEXT: scratch_load_dword v31, off, s32 ; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:104 ; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:100 +; GFX950-NEXT: scratch_load_dword v31, off, s32 ; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse @@ -2485,7 +2485,7 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse ; GFX950-NEXT: v_cndmask_b32_e64 v22, v0, 0, vcc ; GFX950-NEXT: 
v_cndmask_b32_e32 v23, v1, v2, vcc -; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: s_waitcnt vmcnt(7) ; GFX950-NEXT: v_max_f64 v[0:1], v[24:25], v[34:35] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[34:35] ; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse @@ -2529,25 +2529,25 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 ; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 ; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 ; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:48 ; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 ; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:80 ; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:100 ; GFX10-NEXT: s_waitcnt vmcnt(23) ; GFX10-NEXT: v_max_f64 v[82:83], v[0:1], v[31:32] ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[31:32] @@ -2558,38 +2558,39 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: v_max_f64 v[32:33], v[4:5], v[35:36] ; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[35:36] ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; GFX10-NEXT: buffer_load_dword 
v4, off, s[0:3], s32 offset:124 -; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: s_waitcnt vmcnt(25) ; GFX10-NEXT: v_max_f64 v[34:35], v[6:7], v[48:49] ; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[48:49] -; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_cmp_u_f64_e64 s10, v[14:15], v[52:53] -; GFX10-NEXT: s_waitcnt vmcnt(19) -; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[54:55] -; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[10:11], v[64:65] -; GFX10-NEXT: s_waitcnt vmcnt(16) +; GFX10-NEXT: s_waitcnt vmcnt(23) ; GFX10-NEXT: v_max_f64 v[48:49], v[8:9], v[37:38] ; GFX10-NEXT: v_cmp_u_f64_e64 s7, v[8:9], v[37:38] +; GFX10-NEXT: s_waitcnt vmcnt(21) ; GFX10-NEXT: v_max_f64 v[36:37], v[10:11], v[64:65] +; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[10:11], v[64:65] +; GFX10-NEXT: s_waitcnt vmcnt(19) ; GFX10-NEXT: v_max_f64 v[38:39], v[12:13], v[54:55] +; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[54:55] +; GFX10-NEXT: s_waitcnt vmcnt(17) ; GFX10-NEXT: v_max_f64 v[54:55], v[14:15], v[52:53] +; GFX10-NEXT: v_cmp_u_f64_e64 s10, v[14:15], v[52:53] +; GFX10-NEXT: s_waitcnt vmcnt(15) +; GFX10-NEXT: v_max_f64 v[52:53], v[16:17], v[50:51] +; GFX10-NEXT: v_cmp_u_f64_e64 s11, v[16:17], v[50:51] +; GFX10-NEXT: s_waitcnt vmcnt(13) +; GFX10-NEXT: v_max_f64 v[50:51], v[18:19], v[80:81] +; GFX10-NEXT: v_cmp_u_f64_e64 s12, v[18:19], v[80:81] ; GFX10-NEXT: s_waitcnt vmcnt(11) ; GFX10-NEXT: v_max_f64 v[64:65], v[20:21], v[70:71] ; GFX10-NEXT: v_cmp_u_f64_e64 s13, v[20:21], v[70:71] ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_cmp_u_f64_e64 s12, v[18:19], v[80:81] -; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: v_max_f64 v[52:53], v[16:17], v[50:51] -; GFX10-NEXT: v_cmp_u_f64_e64 s11, v[16:17], v[50:51] -; GFX10-NEXT: v_max_f64 v[50:51], v[18:19], v[80:81] ; GFX10-NEXT: v_max_f64 v[70:71], v[22:23], v[68:69] ; GFX10-NEXT: v_cmp_u_f64_e64 s14, v[22:23], v[68:69] ; GFX10-NEXT: v_cndmask_b32_e64 v6, v34, 0, s6 @@ -2610,7 +2611,7 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: v_cndmask_b32_e64 v21, v65, 0x7ff80000, s13 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v70, 0, s14 ; GFX10-NEXT: v_cndmask_b32_e64 v23, v71, 0x7ff80000, s14 -; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_max_f64 v[68:69], v[24:25], v[66:67] ; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[66:67] ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -2619,10 +2620,10 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_max_f64 v[80:81], v[28:29], v[2:3] ; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v82, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f64 v[86:87], v[30:31], v[4:5] ; GFX10-NEXT: v_cmp_u_f64_e64 s18, v[30:31], v[4:5] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v82, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v83, 0x7ff80000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v84, 0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v85, 0x7ff80000, s4 @@ -2642,7 +2643,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 @@ -2673,51 +2673,52 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, 
<16 x double> %src1) ; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:108 ; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:124 -; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: s_waitcnt vmcnt(31) ; GFX11-NEXT: v_max_f64 v[96:97], v[0:1], v[32:33] ; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[32:33] -; GFX11-NEXT: s_waitcnt vmcnt(28) +; GFX11-NEXT: s_waitcnt vmcnt(29) ; GFX11-NEXT: v_max_f64 v[32:33], v[2:3], v[34:35] ; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[34:35] -; GFX11-NEXT: s_waitcnt vmcnt(26) +; GFX11-NEXT: s_waitcnt vmcnt(27) ; GFX11-NEXT: v_max_f64 v[34:35], v[4:5], v[36:37] ; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[36:37] -; GFX11-NEXT: s_waitcnt vmcnt(24) +; GFX11-NEXT: s_waitcnt vmcnt(25) ; GFX11-NEXT: v_max_f64 v[36:37], v[6:7], v[38:39] ; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[38:39] -; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: s_waitcnt vmcnt(23) ; GFX11-NEXT: v_max_f64 v[38:39], v[8:9], v[48:49] ; GFX11-NEXT: v_cmp_u_f64_e64 s3, v[8:9], v[48:49] -; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: s_waitcnt vmcnt(21) ; GFX11-NEXT: v_max_f64 v[48:49], v[10:11], v[50:51] ; GFX11-NEXT: v_cmp_u_f64_e64 s4, v[10:11], v[50:51] -; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: s_waitcnt vmcnt(19) ; GFX11-NEXT: v_max_f64 v[50:51], v[12:13], v[52:53] ; GFX11-NEXT: v_cmp_u_f64_e64 s5, v[12:13], v[52:53] -; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: s_waitcnt vmcnt(17) ; GFX11-NEXT: v_max_f64 v[52:53], v[14:15], v[54:55] ; GFX11-NEXT: v_cmp_u_f64_e64 s6, v[14:15], v[54:55] -; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: s_waitcnt vmcnt(15) ; GFX11-NEXT: v_max_f64 v[54:55], v[16:17], v[64:65] ; GFX11-NEXT: v_cmp_u_f64_e64 s7, v[16:17], v[64:65] -; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: s_waitcnt vmcnt(13) ; GFX11-NEXT: v_max_f64 v[64:65], v[18:19], v[66:67] ; GFX11-NEXT: v_cmp_u_f64_e64 s8, v[18:19], v[66:67] -; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: s_waitcnt vmcnt(11) ; GFX11-NEXT: v_max_f64 v[66:67], v[20:21], v[68:69] ; GFX11-NEXT: v_cmp_u_f64_e64 s9, v[20:21], v[68:69] -; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: s_waitcnt vmcnt(9) ; GFX11-NEXT: v_max_f64 v[68:69], v[22:23], v[70:71] ; GFX11-NEXT: v_cmp_u_f64_e64 s10, v[22:23], v[70:71] -; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: s_waitcnt vmcnt(7) ; GFX11-NEXT: v_max_f64 v[70:71], v[24:25], v[80:81] ; GFX11-NEXT: v_cmp_u_f64_e64 s11, v[24:25], v[80:81] -; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: v_max_f64 v[80:81], v[26:27], v[82:83] ; GFX11-NEXT: v_cmp_u_f64_e64 s12, v[26:27], v[82:83] -; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_max_f64 v[82:83], v[28:29], v[84:85] ; GFX11-NEXT: v_cmp_u_f64_e64 s13, v[28:29], v[84:85] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2765,7 +2766,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1f -; GFX12-NEXT: scratch_load_b32 v31, off, s32 ; GFX12-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX12-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX12-NEXT: scratch_load_b32 v35, off, s32 offset:16 @@ -2796,37 +2796,38 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX12-NEXT: scratch_load_b32 v82, off, s32 
offset:108 ; GFX12-NEXT: scratch_load_b32 v85, off, s32 offset:120 ; GFX12-NEXT: scratch_load_b32 v84, off, s32 offset:116 +; GFX12-NEXT: scratch_load_b32 v31, off, s32 ; GFX12-NEXT: scratch_load_b32 v87, off, s32 offset:128 ; GFX12-NEXT: scratch_load_b32 v86, off, s32 offset:124 -; GFX12-NEXT: s_wait_loadcnt 0x1e +; GFX12-NEXT: s_wait_loadcnt 0x1f ; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[32:33] -; GFX12-NEXT: s_wait_loadcnt 0x1c +; GFX12-NEXT: s_wait_loadcnt 0x1d ; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[34:35] -; GFX12-NEXT: s_wait_loadcnt 0x1a +; GFX12-NEXT: s_wait_loadcnt 0x1b ; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[36:37] -; GFX12-NEXT: s_wait_loadcnt 0x18 +; GFX12-NEXT: s_wait_loadcnt 0x19 ; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[38:39] -; GFX12-NEXT: s_wait_loadcnt 0x16 +; GFX12-NEXT: s_wait_loadcnt 0x17 ; GFX12-NEXT: v_maximum_f64 v[8:9], v[8:9], v[48:49] -; GFX12-NEXT: s_wait_loadcnt 0x14 +; GFX12-NEXT: s_wait_loadcnt 0x15 ; GFX12-NEXT: v_maximum_f64 v[10:11], v[10:11], v[50:51] -; GFX12-NEXT: s_wait_loadcnt 0x12 +; GFX12-NEXT: s_wait_loadcnt 0x13 ; GFX12-NEXT: v_maximum_f64 v[12:13], v[12:13], v[52:53] -; GFX12-NEXT: s_wait_loadcnt 0x10 +; GFX12-NEXT: s_wait_loadcnt 0x11 ; GFX12-NEXT: v_maximum_f64 v[14:15], v[14:15], v[54:55] -; GFX12-NEXT: s_wait_loadcnt 0xe +; GFX12-NEXT: s_wait_loadcnt 0xf ; GFX12-NEXT: v_maximum_f64 v[16:17], v[16:17], v[64:65] -; GFX12-NEXT: s_wait_loadcnt 0xc +; GFX12-NEXT: s_wait_loadcnt 0xd ; GFX12-NEXT: v_maximum_f64 v[18:19], v[18:19], v[66:67] -; GFX12-NEXT: s_wait_loadcnt 0xa +; GFX12-NEXT: s_wait_loadcnt 0xb ; GFX12-NEXT: v_maximum_f64 v[20:21], v[20:21], v[68:69] -; GFX12-NEXT: s_wait_loadcnt 0x8 +; GFX12-NEXT: s_wait_loadcnt 0x9 ; GFX12-NEXT: v_maximum_f64 v[22:23], v[22:23], v[70:71] -; GFX12-NEXT: s_wait_loadcnt 0x6 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: v_maximum_f64 v[24:25], v[24:25], v[80:81] -; GFX12-NEXT: s_wait_loadcnt 0x4 +; GFX12-NEXT: s_wait_loadcnt 0x5 ; GFX12-NEXT: v_maximum_f64 v[26:27], v[26:27], v[82:83] -; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: s_wait_loadcnt 0x3 ; GFX12-NEXT: v_maximum_f64 v[28:29], v[28:29], v[84:85] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_maximum_f64 v[30:31], v[30:31], v[86:87] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index dfd67873c3b86..27d5955fbd9e1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -2399,9 +2399,9 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:84 ; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:96 ; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:92 -; GFX950-NEXT: scratch_load_dword v31, off, s32 ; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:104 ; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:100 +; GFX950-NEXT: scratch_load_dword v31, off, s32 ; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse @@ -2485,7 +2485,7 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse ; GFX950-NEXT: v_cndmask_b32_e64 v22, v0, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v23, v1, v2, vcc -; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: s_waitcnt vmcnt(7) ; GFX950-NEXT: v_min_f64 v[0:1], v[24:25], v[34:35] ; 
GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[34:35] ; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse @@ -2529,25 +2529,25 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 ; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 ; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 ; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:48 ; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 ; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:80 ; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:100 ; GFX10-NEXT: s_waitcnt vmcnt(23) ; GFX10-NEXT: v_min_f64 v[82:83], v[0:1], v[31:32] ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[31:32] @@ -2558,38 +2558,39 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: v_min_f64 v[32:33], v[4:5], v[35:36] ; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[35:36] ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 -; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: s_waitcnt vmcnt(25) ; GFX10-NEXT: v_min_f64 v[34:35], v[6:7], v[48:49] ; 
GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[48:49] -; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_cmp_u_f64_e64 s10, v[14:15], v[52:53] -; GFX10-NEXT: s_waitcnt vmcnt(19) -; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[54:55] -; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[10:11], v[64:65] -; GFX10-NEXT: s_waitcnt vmcnt(16) +; GFX10-NEXT: s_waitcnt vmcnt(23) ; GFX10-NEXT: v_min_f64 v[48:49], v[8:9], v[37:38] ; GFX10-NEXT: v_cmp_u_f64_e64 s7, v[8:9], v[37:38] +; GFX10-NEXT: s_waitcnt vmcnt(21) ; GFX10-NEXT: v_min_f64 v[36:37], v[10:11], v[64:65] +; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[10:11], v[64:65] +; GFX10-NEXT: s_waitcnt vmcnt(19) ; GFX10-NEXT: v_min_f64 v[38:39], v[12:13], v[54:55] +; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[54:55] +; GFX10-NEXT: s_waitcnt vmcnt(17) ; GFX10-NEXT: v_min_f64 v[54:55], v[14:15], v[52:53] +; GFX10-NEXT: v_cmp_u_f64_e64 s10, v[14:15], v[52:53] +; GFX10-NEXT: s_waitcnt vmcnt(15) +; GFX10-NEXT: v_min_f64 v[52:53], v[16:17], v[50:51] +; GFX10-NEXT: v_cmp_u_f64_e64 s11, v[16:17], v[50:51] +; GFX10-NEXT: s_waitcnt vmcnt(13) +; GFX10-NEXT: v_min_f64 v[50:51], v[18:19], v[80:81] +; GFX10-NEXT: v_cmp_u_f64_e64 s12, v[18:19], v[80:81] ; GFX10-NEXT: s_waitcnt vmcnt(11) ; GFX10-NEXT: v_min_f64 v[64:65], v[20:21], v[70:71] ; GFX10-NEXT: v_cmp_u_f64_e64 s13, v[20:21], v[70:71] ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_cmp_u_f64_e64 s12, v[18:19], v[80:81] -; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: v_min_f64 v[52:53], v[16:17], v[50:51] -; GFX10-NEXT: v_cmp_u_f64_e64 s11, v[16:17], v[50:51] -; GFX10-NEXT: v_min_f64 v[50:51], v[18:19], v[80:81] ; GFX10-NEXT: v_min_f64 v[70:71], v[22:23], v[68:69] ; GFX10-NEXT: v_cmp_u_f64_e64 s14, v[22:23], v[68:69] ; GFX10-NEXT: v_cndmask_b32_e64 v6, v34, 0, s6 @@ -2610,7 +2611,7 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: v_cndmask_b32_e64 v21, v65, 0x7ff80000, s13 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v70, 0, s14 ; GFX10-NEXT: v_cndmask_b32_e64 v23, v71, 0x7ff80000, s14 -; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_min_f64 v[68:69], v[24:25], v[66:67] ; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[66:67] ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -2619,10 +2620,10 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_min_f64 v[80:81], v[28:29], v[2:3] ; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v82, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_f64 v[86:87], v[30:31], v[4:5] ; GFX10-NEXT: v_cmp_u_f64_e64 s18, v[30:31], v[4:5] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v82, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v83, 0x7ff80000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v84, 0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v85, 0x7ff80000, s4 @@ -2642,7 +2643,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 @@ -2673,51 +2673,52 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:108 ; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:120 ; GFX11-NEXT: 
scratch_load_b32 v84, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:124 -; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: s_waitcnt vmcnt(31) ; GFX11-NEXT: v_min_f64 v[96:97], v[0:1], v[32:33] ; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[32:33] -; GFX11-NEXT: s_waitcnt vmcnt(28) +; GFX11-NEXT: s_waitcnt vmcnt(29) ; GFX11-NEXT: v_min_f64 v[32:33], v[2:3], v[34:35] ; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[34:35] -; GFX11-NEXT: s_waitcnt vmcnt(26) +; GFX11-NEXT: s_waitcnt vmcnt(27) ; GFX11-NEXT: v_min_f64 v[34:35], v[4:5], v[36:37] ; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[36:37] -; GFX11-NEXT: s_waitcnt vmcnt(24) +; GFX11-NEXT: s_waitcnt vmcnt(25) ; GFX11-NEXT: v_min_f64 v[36:37], v[6:7], v[38:39] ; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[38:39] -; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: s_waitcnt vmcnt(23) ; GFX11-NEXT: v_min_f64 v[38:39], v[8:9], v[48:49] ; GFX11-NEXT: v_cmp_u_f64_e64 s3, v[8:9], v[48:49] -; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: s_waitcnt vmcnt(21) ; GFX11-NEXT: v_min_f64 v[48:49], v[10:11], v[50:51] ; GFX11-NEXT: v_cmp_u_f64_e64 s4, v[10:11], v[50:51] -; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: s_waitcnt vmcnt(19) ; GFX11-NEXT: v_min_f64 v[50:51], v[12:13], v[52:53] ; GFX11-NEXT: v_cmp_u_f64_e64 s5, v[12:13], v[52:53] -; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: s_waitcnt vmcnt(17) ; GFX11-NEXT: v_min_f64 v[52:53], v[14:15], v[54:55] ; GFX11-NEXT: v_cmp_u_f64_e64 s6, v[14:15], v[54:55] -; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: s_waitcnt vmcnt(15) ; GFX11-NEXT: v_min_f64 v[54:55], v[16:17], v[64:65] ; GFX11-NEXT: v_cmp_u_f64_e64 s7, v[16:17], v[64:65] -; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: s_waitcnt vmcnt(13) ; GFX11-NEXT: v_min_f64 v[64:65], v[18:19], v[66:67] ; GFX11-NEXT: v_cmp_u_f64_e64 s8, v[18:19], v[66:67] -; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: s_waitcnt vmcnt(11) ; GFX11-NEXT: v_min_f64 v[66:67], v[20:21], v[68:69] ; GFX11-NEXT: v_cmp_u_f64_e64 s9, v[20:21], v[68:69] -; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: s_waitcnt vmcnt(9) ; GFX11-NEXT: v_min_f64 v[68:69], v[22:23], v[70:71] ; GFX11-NEXT: v_cmp_u_f64_e64 s10, v[22:23], v[70:71] -; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: s_waitcnt vmcnt(7) ; GFX11-NEXT: v_min_f64 v[70:71], v[24:25], v[80:81] ; GFX11-NEXT: v_cmp_u_f64_e64 s11, v[24:25], v[80:81] -; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: v_min_f64 v[80:81], v[26:27], v[82:83] ; GFX11-NEXT: v_cmp_u_f64_e64 s12, v[26:27], v[82:83] -; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_min_f64 v[82:83], v[28:29], v[84:85] ; GFX11-NEXT: v_cmp_u_f64_e64 s13, v[28:29], v[84:85] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2765,7 +2766,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1f -; GFX12-NEXT: scratch_load_b32 v31, off, s32 ; GFX12-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX12-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX12-NEXT: scratch_load_b32 v35, off, s32 offset:16 @@ -2796,37 +2796,38 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX12-NEXT: scratch_load_b32 v82, off, s32 offset:108 ; GFX12-NEXT: scratch_load_b32 v85, off, s32 offset:120 ; GFX12-NEXT: scratch_load_b32 v84, off, s32 offset:116 +; GFX12-NEXT: 
scratch_load_b32 v31, off, s32 ; GFX12-NEXT: scratch_load_b32 v87, off, s32 offset:128 ; GFX12-NEXT: scratch_load_b32 v86, off, s32 offset:124 -; GFX12-NEXT: s_wait_loadcnt 0x1e +; GFX12-NEXT: s_wait_loadcnt 0x1f ; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[32:33] -; GFX12-NEXT: s_wait_loadcnt 0x1c +; GFX12-NEXT: s_wait_loadcnt 0x1d ; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[34:35] -; GFX12-NEXT: s_wait_loadcnt 0x1a +; GFX12-NEXT: s_wait_loadcnt 0x1b ; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[36:37] -; GFX12-NEXT: s_wait_loadcnt 0x18 +; GFX12-NEXT: s_wait_loadcnt 0x19 ; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[38:39] -; GFX12-NEXT: s_wait_loadcnt 0x16 +; GFX12-NEXT: s_wait_loadcnt 0x17 ; GFX12-NEXT: v_minimum_f64 v[8:9], v[8:9], v[48:49] -; GFX12-NEXT: s_wait_loadcnt 0x14 +; GFX12-NEXT: s_wait_loadcnt 0x15 ; GFX12-NEXT: v_minimum_f64 v[10:11], v[10:11], v[50:51] -; GFX12-NEXT: s_wait_loadcnt 0x12 +; GFX12-NEXT: s_wait_loadcnt 0x13 ; GFX12-NEXT: v_minimum_f64 v[12:13], v[12:13], v[52:53] -; GFX12-NEXT: s_wait_loadcnt 0x10 +; GFX12-NEXT: s_wait_loadcnt 0x11 ; GFX12-NEXT: v_minimum_f64 v[14:15], v[14:15], v[54:55] -; GFX12-NEXT: s_wait_loadcnt 0xe +; GFX12-NEXT: s_wait_loadcnt 0xf ; GFX12-NEXT: v_minimum_f64 v[16:17], v[16:17], v[64:65] -; GFX12-NEXT: s_wait_loadcnt 0xc +; GFX12-NEXT: s_wait_loadcnt 0xd ; GFX12-NEXT: v_minimum_f64 v[18:19], v[18:19], v[66:67] -; GFX12-NEXT: s_wait_loadcnt 0xa +; GFX12-NEXT: s_wait_loadcnt 0xb ; GFX12-NEXT: v_minimum_f64 v[20:21], v[20:21], v[68:69] -; GFX12-NEXT: s_wait_loadcnt 0x8 +; GFX12-NEXT: s_wait_loadcnt 0x9 ; GFX12-NEXT: v_minimum_f64 v[22:23], v[22:23], v[70:71] -; GFX12-NEXT: s_wait_loadcnt 0x6 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: v_minimum_f64 v[24:25], v[24:25], v[80:81] -; GFX12-NEXT: s_wait_loadcnt 0x4 +; GFX12-NEXT: s_wait_loadcnt 0x5 ; GFX12-NEXT: v_minimum_f64 v[26:27], v[26:27], v[82:83] -; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: s_wait_loadcnt 0x3 ; GFX12-NEXT: v_minimum_f64 v[28:29], v[28:29], v[84:85] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_minimum_f64 v[30:31], v[30:31], v[86:87] diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index a135b43bad0fe..049c1329422cd 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -567,39 +567,52 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:4 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:6 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:10 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v6, off, s[0:3], 0 offset:12 ; GCN-NOHSA-SI-NEXT: buffer_load_ushort v7, off, s[0:3], 0 offset:14 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v8, off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v9, off, s[0:3], 0 offset:18 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v10, off, s[0:3], 0 offset:20 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v11, off, s[0:3], 0 offset:22 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v12, off, s[0:3], 0 offset:24 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:10 +; 
GCN-NOHSA-SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:6 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v15, off, s[0:3], 0 offset:30 ; GCN-NOHSA-SI-NEXT: buffer_load_ushort v13, off, s[0:3], 0 offset:26 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v11, off, s[0:3], 0 offset:22 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v9, off, s[0:3], 0 offset:18 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v6, off, s[0:3], 0 offset:12 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:4 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_ushort v14, off, s[0:3], 0 offset:28 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v15, off, s[0:3], 0 offset:30 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v12, off, s[0:3], 0 offset:24 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v10, off, s[0:3], 0 offset:20 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v8, off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(14) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(13) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(12) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(11) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(10) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(9) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v16, v5 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v1, v17, v4 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v18, v0 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v9, v8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -774,25 +787,24 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:8 ; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:4 ; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v8, s[0:1] -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x6 ; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x6 ; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x6 ; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x6 ; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:18 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x5 ; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x5 +; GFX12-TRUE16-NEXT: s_clause 0x2 ; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:2 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: s_clause 0x1 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x2 ; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX12-TRUE16-NEXT: s_endpgm ; @@ -810,25 +822,24 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12-FAKE16-NEXT: global_load_u16 v2, v8, s[0:1] offset:8 ; GFX12-FAKE16-NEXT: global_load_u16 v1, v8, s[0:1] offset:4 ; GFX12-FAKE16-NEXT: global_load_u16 v0, v8, s[0:1] -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x6 ; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x6 ; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x6 ; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x6 ; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:18 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x5 ; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x5 +; GFX12-FAKE16-NEXT: s_clause 0x2 ; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: s_clause 0x1 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x2 ; GFX12-FAKE16-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX12-FAKE16-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll index c119ef274bb04..3edba28348bf4 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll @@ -712,8 +712,8 @@ define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] @@ -734,8 +734,9 @@ define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15] diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index bca39d06e941c..7373a2bc880c1 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -650,39 +650,52 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s5 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v4, off, s[8:11], 0 offset:4 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:6 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v5, off, s[8:11], 0 offset:8 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 offset:10 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v6, off, s[8:11], 0 offset:12 ; GCN-NOHSA-SI-NEXT: buffer_load_ushort v7, off, s[8:11], 0 offset:14 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v8, off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v9, off, s[8:11], 0 offset:18 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v10, off, s[8:11], 0 offset:20 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v11, off, s[8:11], 0 offset:22 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v12, off, s[8:11], 0 offset:24 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 offset:10 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:6 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v15, off, s[8:11], 0 offset:30 ; GCN-NOHSA-SI-NEXT: buffer_load_ushort v13, off, s[8:11], 0 offset:26 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v11, off, s[8:11], 0 offset:22 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v9, off, s[8:11], 0 offset:18 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v6, off, s[8:11], 0 offset:12 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v5, off, s[8:11], 0 offset:8 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v4, off, s[8:11], 0 offset:4 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_ushort v14, off, s[8:11], 0 offset:28 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v15, off, s[8:11], 0 offset:30 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v12, off, s[8:11], 0 offset:24 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v10, off, s[8:11], 0 offset:20 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v8, off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(14) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(13) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(12) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(11) ; 
GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(10) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(9) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v16, v5 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v1, v17, v4 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v18, v0 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v9, v8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -4351,15 +4364,15 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s7 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[40:43], off, s[4:7], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16 @@ -4372,7 +4385,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(10) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v35 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v34 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v35, 0, 16 @@ -4381,7 +4394,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v32 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v33, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v32, 0, 16 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(9) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v39 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v38 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v39, 0, 16 @@ -4390,7 +4403,7 @@ define amdgpu_kernel void 
@global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v36 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v37, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v36, 0, 16 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v43 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v42 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v43, 0, 16 @@ -4399,6 +4412,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v40 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v41, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v40, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v31 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v30 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v31, 0, 16 @@ -4407,6 +4421,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v28 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v29, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v28, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v30, 16, v27 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 16, v26 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v27, 0, 16 @@ -4415,6 +4430,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v24 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v25, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v24, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v23 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v22 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v23, 0, 16 @@ -4423,6 +4439,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v20 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v21, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v19, 0, 16 @@ -8115,8 +8132,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15 @@ -8124,10 +8141,11 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v23, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v15 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll 
b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index e55fb2cac0985..a759b11785b29 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -2755,8 +2755,8 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v5 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v0 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v1 @@ -2773,7 +2773,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v10 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v11 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v16 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v17 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 @@ -2781,7 +2781,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v18 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v19 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 -; SI-NOHSA-NEXT: s_waitcnt expcnt(0) +; SI-NOHSA-NEXT: s_waitcnt vmcnt(6) expcnt(0) ; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v12 ; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v13 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 @@ -3906,15 +3906,15 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: s_mov_b32 s1, s5 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6 @@ -3928,7 +3928,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v32 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v33 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 @@ -3936,7 +3936,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; 
SI-NOHSA-NEXT: v_mov_b32_e32 v0, v34 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v35 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; SI-NOHSA-NEXT: s_waitcnt expcnt(0) +; SI-NOHSA-NEXT: s_waitcnt vmcnt(6) expcnt(0) ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v28 ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v29 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 @@ -4021,8 +4021,8 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1 @@ -4117,7 +4117,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(13) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 @@ -4133,6 +4133,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(14) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -4483,25 +4484,29 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; SI-NOHSA-NEXT: s_mov_b32 s5, s1 ; SI-NOHSA-NEXT: s_mov_b32 s8, s2 ; SI-NOHSA-NEXT: s_mov_b32 s9, s3 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:112 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) +; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:64 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:80 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) +; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:32 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) +; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 
0 offset:48 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; SI-NOHSA-NEXT: s_endpgm ; @@ -4610,21 +4615,23 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 ; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16 ; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 ; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index f879dc660203f..385902f2d707e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -7527,16 +7527,17 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) expcnt(1) +; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) expcnt(1) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v14 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v12 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v15, v9 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v53, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index bd191a37582c0..25cfa24c1ddd6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -7311,8 +7311,8 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; SI-NEXT: ds_read2_b64 v[10:13], v0 offset0:4 offset1:5 ; SI-NEXT: ds_read2_b64 v[14:17], v0 offset0:6 offset1:7 +; SI-NEXT: ds_read2_b64 v[10:13], v0 offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:14 offset1:15 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3 @@ -7324,7 +7324,7 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v7 ; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:2 offset1:3 -; SI-NEXT: s_waitcnt lgkmcnt(4) +; SI-NEXT: s_waitcnt lgkmcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v17 ; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:30 offset1:31 @@ -7333,6 +7333,7 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: v_mov_b32_e32 v20, v1 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v15 ; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:26 offset1:27 +; SI-NEXT: s_waitcnt lgkmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v13 ; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:22 offset1:23 @@ -7464,12 +7465,12 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v21, v5 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v22, s0 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset1:1 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29 @@ -7480,6 +7481,7 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(5) ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9 @@ -7496,7 +7498,7 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, 
v[6:7], v[0:1] offset0:16 offset1:17 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v17 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15 @@ -7508,6 +7510,7 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v14 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(10) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9 diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll index c9615f478e5b5..088fe562dc24f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -11,13 +11,14 @@ define amdgpu_vs void @test(ptr addrspace(8) inreg %arg1, ptr addrspace(3) %arg2 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_read_b32 v0, v0 ; CHECK-NEXT: ds_read_b32 v3, v1 ; CHECK-NEXT: ds_read_b32 v2, v2 ; CHECK-NEXT: ds_read_b32 v1, v4 -; CHECK-NEXT: ds_read_b32 v0, v0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: exp mrt0 off, off, off, off ; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen ; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float poison, float poison, float poison, float poison, i1 false, i1 false) @@ -38,12 +39,12 @@ define amdgpu_vs void @test_2(ptr addrspace(8) inreg %arg1, i32 %arg2, i32 inreg ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v2, v2 -; CHECK-NEXT: ds_read_b32 v5, v4 -; CHECK-NEXT: ds_read_b32 v4, v6 ; CHECK-NEXT: ds_read_b32 v9, v7 ; CHECK-NEXT: ds_read_b32 v8, v8 ; CHECK-NEXT: ds_read_b32 v7, v10 +; CHECK-NEXT: ds_read_b32 v2, v2 +; CHECK-NEXT: ds_read_b32 v5, v4 +; CHECK-NEXT: ds_read_b32 v4, v6 ; CHECK-NEXT: ds_read_b32 v6, v1 ; CHECK-NEXT: ds_read_b32 v3, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) @@ -78,26 +79,28 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, ptr addrspace(8) ; CHECK-NEXT: ds_read_b32 v6, v0 ; CHECK-NEXT: ds_read_b32 v5, v3 ; CHECK-NEXT: ds_read_b32 v4, v4 +; CHECK-NEXT: ds_read_b32 v3, v1 ; CHECK-NEXT: ds_read_b32 v8, v7 ; CHECK-NEXT: ds_read_b32 v7, v9 -; CHECK-NEXT: ds_read_b32 v3, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v2 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 20, v2 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, 16, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_waitcnt lgkmcnt(2) ; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc ; CHECK-NEXT: s_waitcnt expcnt(1) ; CHECK-NEXT: ds_read_b32 v5, v11 ; CHECK-NEXT: ds_read_b32 v4, v12 ; CHECK-NEXT: 
ds_read_b32 v3, v0 -; CHECK-NEXT: ds_read_b32 v1, v1 ; CHECK-NEXT: ds_read_b32 v0, v9 ; CHECK-NEXT: ds_read_b32 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: ds_read_b32 v1, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) ; CHECK-NEXT: exp mrt0 off, off, off, off -; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc ; CHECK-NEXT: s_endpgm %load1 = load <6 x float>, ptr addrspace(3) %arg5, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll index 5b6af7654f7e9..302cf003042d9 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll @@ -96,35 +96,38 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_u8 v1, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v2, v0 offset:6 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:4 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:2 ; GFX7-NEXT: ds_read_u8 v5, v0 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:3 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:2 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:5 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:4 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:6 ; GFX7-NEXT: s_waitcnt lgkmcnt(7) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: s_waitcnt lgkmcnt(6) ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8 ; GFX7-NEXT: ds_read_u8 v5, v0 offset:9 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:11 +; GFX7-NEXT: ds_read_u8 v11, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:13 +; GFX7-NEXT: ds_read_u8 v10, v0 offset:12 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:15 ; GFX7-NEXT: ds_read_u8 v9, v0 offset:14 -; GFX7-NEXT: ds_read_u8 v10, v0 offset:12 -; GFX7-NEXT: ds_read_u8 v11, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:8 +; GFX7-NEXT: s_waitcnt lgkmcnt(8) ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 @@ -332,25 +335,27 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_u16 v1, v0 offset:2 -; GFX7-NEXT: ds_read_u16 v3, v0 offset:12 -; GFX7-NEXT: ds_read_u16 v2, v0 offset:8 -; GFX7-NEXT: ds_read_u16 v4, v0 offset:4 ; GFX7-NEXT: ds_read_u16 v5, v0 ; GFX7-NEXT: ds_read_u16 v6, v0 offset:6 +; GFX7-NEXT: ds_read_u16 v4, v0 offset:4 ; GFX7-NEXT: ds_read_u16 v7, v0 offset:10 +; GFX7-NEXT: ds_read_u16 v2, v0 offset:8 ; GFX7-NEXT: ds_read_u16 v8, v0 offset:14 +; GFX7-NEXT: ds_read_u16 v3, v0 offset:12 ; GFX7-NEXT: s_waitcnt lgkmcnt(7) ; 
GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(3) -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll index 509aba49893f6..1673a29a5714a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll @@ -87,31 +87,34 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_u8 v1, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v2, v0 offset:6 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:4 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX7-NEXT: ds_read_u8 v5, v0 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:3 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:5 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:4 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:6 ; GFX7-NEXT: s_waitcnt lgkmcnt(7) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: s_waitcnt lgkmcnt(6) ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX7-NEXT: ds_read_u8 v5, v0 offset:9 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:11 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:8 +; GFX7-NEXT: s_waitcnt lgkmcnt(6) ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 @@ -275,20 +278,21 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_u16 v1, v0 offset:2 -; GFX7-NEXT: ds_read_u16 v2, v0 offset:8 -; GFX7-NEXT: ds_read_u16 v3, v0 offset:4 ; GFX7-NEXT: ds_read_u16 v4, v0 ; GFX7-NEXT: ds_read_u16 v5, v0 offset:6 +; GFX7-NEXT: ds_read_u16 v3, v0 offset:4 ; GFX7-NEXT: ds_read_u16 v6, v0 offset:10 +; GFX7-NEXT: ds_read_u16 v2, v0 offset:8 ; GFX7-NEXT: s_waitcnt lgkmcnt(5) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: 
v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index a5b64f6f80d9b..005fb07d82198 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -116,9 +116,9 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v7, v[2:3] ; VI-NEXT: flat_load_ushort v4, v[4:5] ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v7, v[2:3] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v8, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 5b7c36559a366..2448f3e3025e8 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -186,14 +186,14 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72 ; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68 ; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44 ; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48 ; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52 ; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56 ; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 @@ -201,14 +201,14 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 -; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen -; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:8 -; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:16 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80 @@ -474,14 
+474,14 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72 ; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68 ; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44 ; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48 ; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52 ; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56 ; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 @@ -489,14 +489,14 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 -; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen -; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:8 -; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:16 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll index 048610184368d..111354114856b 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll @@ -844,16 +844,17 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: 
s_waitcnt vmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(6) ; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16 @@ -869,15 +870,15 @@ define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -911,16 +912,17 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(6) ; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16 @@ -936,15 +938,15 @@ define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: 
s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 02f39e25cb447..3c27adfe7d47b 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -97,6 +97,11 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: s_clause 0xf ; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 ; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 +; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 +; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 ; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] ; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 ; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 @@ -106,11 +111,6 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 ; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 ; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 @@ -160,7 +160,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 ; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) +; ALIGNED-NEXT: s_waitcnt lgkmcnt(29) ; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:32 ; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:40 @@ -187,6 +187,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 ; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(36) ; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:48 ; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:56 @@ -208,6 +209,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 ; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(43) ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:128 ; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:132 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:136 @@ -229,6 +231,7 @@ define void 
@memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 ; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(50) ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:144 ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:148 ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:152 @@ -250,6 +253,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 ; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:96 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 @@ -271,6 +275,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 ; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 @@ -853,6 +858,11 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: s_clause 0xf ; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[24:25], off offset:240 ; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[24:25], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[24:25], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[24:25], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[81:84], v[24:25], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[24:25], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:144 ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[24:25], off ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[24:25], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[24:25], off offset:32 @@ -862,11 +872,6 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[24:25], off offset:96 ; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[24:25], off offset:112 ; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[24:25], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[24:25], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[81:84], v[24:25], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[24:25], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[24:25], off offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 @@ -2392,6 +2397,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: .LBB3_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x34 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203 ; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: buffer_load_ubyte v117, 
v1, s[0:3], 0 offen offset:254 ; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:253 @@ -2444,14 +2450,14 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:206 ; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:205 ; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203 ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:202 ; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:201 ; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:200 @@ -2514,8 +2520,8 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:143 ; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:142 ; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: s_clause 0xa +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:139 ; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:138 ; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:137 @@ -2526,8 +2532,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:132 ; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:131 ; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -3583,35 +3588,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: .LBB4_1: ; %load-store-loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v15, 
v2, s[0:3], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 ; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252 ; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248 ; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244 ; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 ; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 ; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 ; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 @@ -3623,11 +3604,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:200 ; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:196 ; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 ; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188 ; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184 ; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180 ; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 ; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 ; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 ; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 @@ -3639,13 +3620,37 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 ; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 ; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 -; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v16, v2, 
s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 ; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 @@ -3653,29 +3658,35 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: s_waitcnt vmcnt(60) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(32) +; CHECK-NEXT: s_waitcnt vmcnt(56) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 -; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: s_waitcnt vmcnt(52) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:208 -; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: s_waitcnt vmcnt(48) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:192 -; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: s_waitcnt vmcnt(44) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 -; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: s_waitcnt vmcnt(40) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 -; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: s_waitcnt vmcnt(36) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 -; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: s_waitcnt vmcnt(32) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(28) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(24) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(20) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(12) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 ; 
CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] @@ -5336,14 +5347,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: .LBB4_1: ; %load-store-loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb -; UNROLL3-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen -; UNROLL3-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 -; UNROLL3-NEXT: buffer_load_dword v6, v3, s[0:3], 0 offen offset:8 -; UNROLL3-NEXT: buffer_load_dword v7, v3, s[0:3], 0 offen offset:12 ; UNROLL3-NEXT: buffer_load_dword v8, v3, s[0:3], 0 offen offset:16 ; UNROLL3-NEXT: buffer_load_dword v9, v3, s[0:3], 0 offen offset:20 ; UNROLL3-NEXT: buffer_load_dword v10, v3, s[0:3], 0 offen offset:24 ; UNROLL3-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen +; UNROLL3-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_load_dword v6, v3, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_load_dword v7, v3, s[0:3], 0 offen offset:12 ; UNROLL3-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen offset:32 ; UNROLL3-NEXT: buffer_load_dword v13, v3, s[0:3], 0 offen offset:36 ; UNROLL3-NEXT: buffer_load_dword v14, v3, s[0:3], 0 offen offset:40 @@ -5354,8 +5365,9 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 ; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3 ; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5] -; UNROLL3-NEXT: s_waitcnt vmcnt(4) +; UNROLL3-NEXT: s_waitcnt vmcnt(8) ; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(4) ; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 @@ -5551,6 +5563,11 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_clause 0xf ; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[20:21] offset:240 ; ALIGNED-NEXT: flat_load_dwordx4 v[22:25], v[20:21] offset:224 +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[20:21] offset:208 +; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[20:21] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[20:21] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[20:21] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[50:53], v[20:21] offset:144 ; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[20:21] ; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[20:21] offset:16 ; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[20:21] offset:32 @@ -5560,11 +5577,6 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[20:21] offset:96 ; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[20:21] offset:112 ; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[20:21] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[50:53], v[20:21] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[20:21] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[20:21] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[20:21] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[20:21] offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 @@ -5618,7 +5630,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:226 ; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) +; ALIGNED-NEXT: s_waitcnt lgkmcnt(29) ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 @@ -5640,6 +5652,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:210 ; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:208 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(36) ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 @@ -5661,6 +5674,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:194 ; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:192 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(43) ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 @@ -5682,6 +5696,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:178 ; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:176 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(50) ; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 @@ -5703,6 +5718,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:162 ; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:160 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 @@ -5724,6 +5740,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:146 ; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:144 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 @@ -6180,6 +6197,11 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_clause 0xf ; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 ; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 +; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 +; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 ; ALIGNED-NEXT: 
flat_load_dwordx4 v[4:7], v[24:25] ; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 ; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 @@ -6189,11 +6211,6 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 ; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 ; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 @@ -6243,7 +6260,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 ; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) +; ALIGNED-NEXT: s_waitcnt lgkmcnt(29) ; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292 ; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 @@ -6269,6 +6286,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 ; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(36) ; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 @@ -6290,6 +6308,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 ; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(43) ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 @@ -6311,6 +6330,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 ; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(50) ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400 ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 @@ -6332,6 +6352,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 ; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 @@ -6353,6 +6374,7 @@ define void 
@memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 ; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 ; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 @@ -7054,6 +7076,11 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_clause 0xf ; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[20:21], off offset:240 ; ALIGNED-NEXT: global_load_dwordx4 v[22:25], v[20:21], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[20:21], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[30:33], v[20:21], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[34:37], v[20:21], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[20:21], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[50:53], v[20:21], off offset:144 ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[20:21], off ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[20:21], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[20:21], off offset:32 @@ -7063,11 +7090,6 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[20:21], off offset:96 ; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[20:21], off offset:112 ; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[20:21], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[50:53], v[20:21], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[20:21], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[34:37], v[20:21], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[30:33], v[20:21], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[20:21], off offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 @@ -7681,6 +7703,11 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_clause 0xf ; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[24:25], off offset:240 ; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[24:25], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[24:25], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[24:25], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[81:84], v[24:25], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[24:25], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:144 ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[24:25], off ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[24:25], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[24:25], off offset:32 @@ -7690,11 +7717,6 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[24:25], off offset:96 ; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[24:25], off offset:112 ; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[24:25], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[24:25], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[81:84], v[24:25], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 
v[96:99], v[24:25], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[24:25], off offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 @@ -10124,6 +10146,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:246 ; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:245 @@ -10186,8 +10209,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188 ; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185 ; ALIGNED-NEXT: s_clause 0x3a +; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185 ; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184 ; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182 @@ -10246,16 +10269,15 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -11323,6 +11345,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199 ; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198 ; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197 @@ -11385,8 +11408,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 
offen offset:139 ; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: s_clause 0xa +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135 ; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134 @@ -11397,10 +11420,10 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x34 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123 @@ -11453,8 +11476,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:74 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -12436,94 +12458,104 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: .LBB9_1: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen 
offset:116 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:172 -; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:188 -; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:176 -; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:168 -; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:164 -; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:160 -; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:236 ; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:252 ; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:248 ; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:244 ; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:236 ; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:232 ; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:228 ; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:160 ; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:156 ; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:152 ; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:148 ; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140 ; CHECK-NEXT: buffer_load_dword v70, 
v2, s[0:3], 0 offen offset:136 ; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:132 ; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:128 -; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 ; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: s_waitcnt vmcnt(60) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: s_waitcnt vmcnt(56) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(52) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(48) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(44) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(40) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:160 -; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: s_waitcnt vmcnt(36) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:144 -; 
CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: s_waitcnt vmcnt(32) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(28) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(24) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(20) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(12) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] @@ -12540,51 +12572,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: .LBB9_4: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 ; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252 ; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248 ; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244 ; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 ; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 ; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 ; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 ; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220 ; 
CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216 ; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212 ; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 ; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200 ; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196 ; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 ; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188 ; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184 ; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180 ; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 ; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 ; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 ; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 @@ -12596,42 +12604,72 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 ; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 ; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 -; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 ; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen +; 
CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 ; CHECK-NEXT: s_addc_u32 s5, s5, -1 -; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: s_waitcnt vmcnt(60) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(32) +; CHECK-NEXT: s_waitcnt vmcnt(56) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 -; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: s_waitcnt vmcnt(52) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 -; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: s_waitcnt vmcnt(48) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 -; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: s_waitcnt vmcnt(44) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 -; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: s_waitcnt vmcnt(40) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 -; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: s_waitcnt vmcnt(36) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 -; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: s_waitcnt vmcnt(32) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 +; CHECK-NEXT: s_waitcnt vmcnt(28) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(24) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(20) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(12) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] @@ -13483,18 +13521,20 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 ; ALIGNED-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 @@ -15034,14 +15074,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 ; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill @@ -15857,14 +15897,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: .LBB9_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb -; UNROLL3-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen -; UNROLL3-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 -; UNROLL3-NEXT: buffer_load_dword v6, v3, s[0:3], 0 offen offset:8 -; UNROLL3-NEXT: buffer_load_dword v7, v3, s[0:3], 0 offen offset:12 ; UNROLL3-NEXT: buffer_load_dword v8, v3, s[0:3], 0 offen offset:16 ; UNROLL3-NEXT: buffer_load_dword v9, v3, s[0:3], 0 offen offset:20 ; UNROLL3-NEXT: buffer_load_dword v10, v3, s[0:3], 0 offen offset:24 ; UNROLL3-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen +; UNROLL3-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_load_dword v6, v3, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_load_dword v7, v3, s[0:3], 0 offen offset:12 ; UNROLL3-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen offset:32 ; UNROLL3-NEXT: buffer_load_dword v13, v3, s[0:3], 0 offen offset:36 ; UNROLL3-NEXT: buffer_load_dword v14, v3, s[0:3], 0 offen offset:40 @@ -15874,8 +15914,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: s_add_u32 s4, s4, 48 ; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3 ; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 -; UNROLL3-NEXT: s_waitcnt vmcnt(4) +; UNROLL3-NEXT: s_waitcnt vmcnt(8) ; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(4) ; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 @@ -15926,14 +15967,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: .LBB9_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb -; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; UNROLL3-NEXT: 
buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 -; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; UNROLL3-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 ; UNROLL3-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; UNROLL3-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:32 ; UNROLL3-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:36 ; UNROLL3-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:40 @@ -15943,8 +15984,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2 ; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 ; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 -; UNROLL3-NEXT: s_waitcnt vmcnt(4) +; UNROLL3-NEXT: s_waitcnt vmcnt(8) ; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[7:10] offset:16 +; UNROLL3-NEXT: s_waitcnt vmcnt(4) ; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[3:6] ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[11:14] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll index 01b7f40f6256f..ba2facc5a8786 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll @@ -141,15 +141,16 @@ define void @memmove_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x4 ; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] +; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 ; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 ; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) ; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16 ; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(5) ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(5) ; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) ; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:20 @@ -198,11 +199,11 @@ define void @memmove_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30 ; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:16 +; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30 ; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16 ; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -370,15 +371,16 @@ define void @memmove_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add ; 
CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x4 ; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30 ; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16 ; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:20 @@ -427,11 +429,11 @@ define void @memmove_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16 +; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30 ; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -485,14 +487,14 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 ; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 ; CHECK-NEXT: ds_read_b128 v[2:5], v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: s_waitcnt lgkmcnt(4) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: s_waitcnt lgkmcnt(4) ; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 @@ -540,14 +542,14 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 ; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 ; CHECK-NEXT: ds_read_b128 v[2:5], v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: s_waitcnt lgkmcnt(4) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: s_waitcnt lgkmcnt(4) ; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 @@ -819,15 +821,16 @@ define void @memmove_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x4 ; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16 -; CHECK-NEXT: 
global_load_dwordx4 v[4:7], v[2:3], off +; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30 ; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16 ; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:20 @@ -876,11 +879,11 @@ define void @memmove_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16 +; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30 ; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -939,18 +942,20 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(7) ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -966,15 +971,15 @@ define void @memmove_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, 
s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -1008,18 +1013,20 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(7) ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -1035,15 +1042,15 @@ define void @memmove_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -1077,19 +1084,20 @@ define void @memmove_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 ; CHECK-NEXT: 
buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: s_waitcnt vmcnt(6) ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(5) ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -1147,19 +1155,20 @@ define void @memmove_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: s_waitcnt vmcnt(6) ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(5) ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -1319,15 +1328,16 @@ define void @memmove_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x4 ; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16 -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] +; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 ; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 ; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) ; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16 ; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) ; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:20 @@ -1373,11 +1383,11 @@ define void @memmove_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30 ; CHECK-NEXT: flat_load_dword v9, 
v[2:3] offset:16 +; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30 ; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16 ; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1536,15 +1546,16 @@ define void @memmove_p1_p1_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x4 ; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30 ; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16 ; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:20 @@ -1590,11 +1601,11 @@ define void @memmove_p1_p1_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16 +; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30 ; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -1755,12 +1766,12 @@ define void @memmove_p1_p3_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1 ; CHECK-NEXT: ds_read_b32 v7, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:30 ; CHECK-NEXT: ds_read2_b32 v[8:9], v2 offset0:5 offset1:6 +; CHECK-NEXT: ds_read_u8 v10, v2 offset:30 ; CHECK-NEXT: ds_read_u16 v2, v2 offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off -; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: s_waitcnt lgkmcnt(2) ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_short v[0:1], v2, off offset:28 @@ -1806,12 +1817,12 @@ define void @memmove_p1_p3_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: ds_read_b128 v[3:6], v2 ; CHECK-NEXT: ds_read_b32 v7, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:30 ; CHECK-NEXT: ds_read2_b32 v[8:9], v2 offset0:5 offset1:6 +; CHECK-NEXT: ds_read_u8 v10, v2 offset:30 ; CHECK-NEXT: ds_read_u16 v2, v2 offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off -; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: s_waitcnt lgkmcnt(2) ; 
CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_short v[0:1], v2, off offset:28 @@ -1963,15 +1974,16 @@ define void @memmove_p1_p4_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x4 ; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30 ; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16 ; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:20 @@ -2017,11 +2029,11 @@ define void @memmove_p1_p4_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16 +; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30 ; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16 ; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -2077,20 +2089,22 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: s_waitcnt vmcnt(7) ; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) ; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8 @@ -2145,20 +2159,22 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: s_waitcnt vmcnt(7) ; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) ; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8 @@ -2213,18 +2229,20 @@ define void @memmove_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(7) ; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 @@ -2279,18 +2297,20 @@ define void @memmove_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: 
global_store_short v[0:1], v11, off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(7) ; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 @@ -2513,11 +2533,11 @@ define void @memmove_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30 ; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:16 +; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30 ; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 ; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; CHECK-NEXT: ds_write_b32 v0, v8 offset:16 ; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3260,19 +3280,20 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: ds_write_b32 v0, v9 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: s_waitcnt vmcnt(7) ; CHECK-NEXT: ds_write_b16 v0, v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) ; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 @@ -3332,19 +3353,20 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: ds_write_b32 v0, v9 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: s_waitcnt vmcnt(7) ; CHECK-NEXT: ds_write_b16 v0, v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) ; 
CHECK-NEXT: ds_write_b8 v0, v8 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 @@ -3477,20 +3499,22 @@ define void @memmove_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: s_waitcnt vmcnt(7) ; CHECK-NEXT: ds_write2_b32 v0, v8, v9 offset0:5 offset1:6 +; CHECK-NEXT: s_waitcnt vmcnt(6) ; CHECK-NEXT: ds_write_b32 v0, v7 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(5) ; CHECK-NEXT: ds_write_b16 v0, v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: ds_write_b8 v0, v6 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[2:5] @@ -3751,11 +3775,11 @@ define void @memmove_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x3 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30 ; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:16 +; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30 ; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 ; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll index 30ad3be46053c..344788fff76ee 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll @@ -12,23 +12,24 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 ; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 ; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 ; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 ; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 ; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] ; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; 
GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_e32 v3, v4, v7 @@ -49,26 +50,29 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 ; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 ; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 ; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] ; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 ; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 ; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_e32 v3, v5, v4 ; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 @@ -91,23 +95,24 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 ; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 ; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 ; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 ; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 ; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] ; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_e32 v3, v4, v7 @@ -129,26 +134,29 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 ; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 ; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 ; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] ; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 ; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 ; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_e32 v3, v5, v4 ; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 @@ -172,23 +180,24 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 ; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 ; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 ; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 ; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 ; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] ; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_e32 v3, v4, v7 @@ -211,22 +220,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: s_clause 0x7 ; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 ; GFX10-NEXT: 
global_load_ubyte v2, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 ; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 ; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 ; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 ; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] ; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: s_waitcnt vmcnt(5) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_or_b32_e32 v3, v4, v7 @@ -253,23 +263,24 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 ; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 ; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 ; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 ; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 ; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] ; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_e32 v3, v4, v7 @@ -293,22 +304,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: s_clause 0x7 ; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 ; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 ; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 ; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 ; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 ; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] ; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: s_waitcnt vmcnt(5) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: 
v_or_b32_e32 v0, v0, v6 +; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_or_b32_e32 v3, v4, v7 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll index cf0fbe4506d20..ae57b8623f20b 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -677,13 +677,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: v_readlane_b32 s4, v17, 9 ; W64-O0-NEXT: v_readlane_b32 s5, v17, 10 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(3) ; W64-O0-NEXT: global_store_dword v[3:4], v5, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: global_store_dword v[0:1], v2, off @@ -1118,12 +1118,13 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: v_readlane_b32 s7, v13, 3 ; W64-O0-NEXT: s_mov_b64 exec, s[6:7] ; W64-O0-NEXT: v_readlane_b32 s4, v13, 1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: s_waitcnt vmcnt(1) ; W64-O0-NEXT: v_and_b32_e64 v1, v1, s5 ; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: v_writelane_b32 v13, s4, 10 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 6368030b445fe..1c13a21f781d0 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -697,13 +697,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: v_readlane_b32 s4, v17, 9 ; W64-O0-NEXT: v_readlane_b32 s5, v17, 10 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(3) ; W64-O0-NEXT: global_store_dword v[3:4], v5, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: global_store_dword v[0:1], v2, off @@ -1149,12 +1149,13 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: v_readlane_b32 s7, v13, 3 ; W64-O0-NEXT: s_mov_b64 exec, s[6:7] ; W64-O0-NEXT: v_readlane_b32 s4, v13, 1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: s_waitcnt vmcnt(1) ; W64-O0-NEXT: v_and_b32_e64 v1, v1, s5 ; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: v_writelane_b32 v13, s4, 10 @@ -1171,14 +1172,17 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_readlane_b32 s4, v13, 0 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(3) ; W64-O0-NEXT: v_mov_b32_e32 v6, v4 +; W64-O0-NEXT: s_waitcnt vmcnt(2) ; W64-O0-NEXT: v_mov_b32_e32 v0, v3 +; W64-O0-NEXT: s_waitcnt vmcnt(1) ; W64-O0-NEXT: v_mov_b32_e32 v4, v2 +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_mov_b32_e32 v5, v1 ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v1, v6 diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 0741cb256cc24..eba995e13e448 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -3641,11 +3641,11 @@ define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:6 ; GFX10-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:2 +; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:6 ; GFX10-NEXT: global_load_ushort v9, v[0:1], off offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: 
s_waitcnt vmcnt(2) ; GFX10-NEXT: v_lshl_or_b32 v0, v8, 16, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v9 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index f4a9e7e8f2759..994abb653e086 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -306,45 +306,45 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], 
s33 offset:144 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload ; GFX906-NEXT: v_readlane_b32 s31, v41, 1 ; GFX906-NEXT: v_readlane_b32 s30, v41, 0 ; GFX906-NEXT: s_mov_b32 s32, s33 ; GFX906-NEXT: v_readlane_b32 s4, v41, 4 ; GFX906-NEXT: v_readlane_b32 s34, v41, 2 ; GFX906-NEXT: v_readlane_b32 s35, v41, 3 -; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_waitcnt vmcnt(28) ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 @@ -685,40 +685,40 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 
offset:36 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload ; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload ; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload ; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword 
v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 s[4:5], exec -; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_waitcnt vmcnt(28) ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 85a9aba1a0e51..10ba7d8365d7f 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -2163,9 +2163,9 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_movk_i32 s0, 0x1000 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[14:15], v[12:13], off ; GFX9-NEXT: global_load_dwordx2 v[16:17], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:2048 +; GFX9-NEXT: global_load_dwordx2 v[14:15], v[12:13], off ; GFX9-NEXT: global_load_dwordx2 v[20:21], v[0:1], off offset:2048 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 @@ -2176,12 +2176,13 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2302,8 +2303,8 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX11-NEXT: 
v_add_co_ci_u32_e64 v11, null, 0, v1, vcc_lo ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: global_load_b64 v[12:13], v[8:9], off offset:2048 -; GFX11-NEXT: global_load_b64 v[14:15], v[10:11], off ; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off +; GFX11-NEXT: global_load_b64 v[14:15], v[10:11], off ; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:2048 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 ; GFX11-NEXT: s_waitcnt vmcnt(6) @@ -2318,7 +2319,7 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 8abbdad893819..d6e62104dcfdb 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -586,47 +586,52 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword 
v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -646,26 +651,29 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; 
GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while @@ -676,35 +684,36 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload 
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: s_waitcnt vmcnt(22) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 @@ -714,6 +723,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4 ; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3] +; GFX9-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 @@ -723,25 +733,28 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] ; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: s_waitcnt vmcnt(15) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-O0-NEXT: s_waitcnt vmcnt(14) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26 ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(12) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -750,12 +763,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc @@ 
-773,13 +788,17 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 ; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 ; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 ; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 ; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 @@ -795,12 +814,15 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 ; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 @@ -888,29 +910,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(15) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 ; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22 @@ -948,12 +971,16 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s7, s8 ; GFX9-O0-NEXT: s_mov_b32 s6, s9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7 ; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16 @@ -989,10 +1016,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -1000,39 +1028,43 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc @@ -1054,10 +1086,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 ; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 @@ -1136,6 +1170,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded 
Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload @@ -1144,22 +1186,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: s_waitcnt vmcnt(14) ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[16:17] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v20 ; GFX9-O0-NEXT: v_mul_lo_u32 v8, v1, v0 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(12) ; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], s4, v[20:21] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 @@ -1186,10 +1220,13 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v16, v5, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_lshrrev_b64 v[8:9], s4, v[18:19] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-O0-NEXT: v_mul_lo_u32 v9, v8, v5 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[14:15] ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 @@ -1356,7 +1393,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 @@ -1888,47 +1927,52 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_9 ; GFX9-O0-NEXT: .LBB1_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-O0-NEXT: v_or3_b32 v4, v4, v11, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -1948,26 +1992,29 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; 
GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_4 ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while @@ -1978,35 +2025,36 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword 
v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-O0-NEXT: s_waitcnt vmcnt(22) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 ; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 @@ -2016,6 +2064,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4 ; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3] +; GFX9-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 ; GFX9-O0-NEXT: 
v_mov_b32_e32 v3, v5 @@ -2025,25 +2074,28 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] ; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: s_waitcnt vmcnt(15) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 ; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-O0-NEXT: s_waitcnt vmcnt(14) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26 ; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 ; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(12) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -2052,12 +2104,14 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc @@ -2075,13 +2129,17 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 ; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 ; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 ; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 ; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 @@ -2097,12 +2155,15 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 +; GFX9-O0-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 ; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 @@ -2190,29 +2251,30 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(9) +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(15) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 ; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22 @@ -2250,12 +2312,16 @@ define i128 
@v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s7, s8 ; GFX9-O0-NEXT: s_mov_b32 s6, s9 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7 ; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16 @@ -2291,10 +2357,11 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2302,39 +2369,43 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(20) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, s7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc @@ -2356,10 +2427,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 ; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 @@ -2438,26 +2511,26 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_7 ; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:88 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-O0-NEXT: v_mul_lo_u32 v4, v5, v2 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-O0-NEXT: v_lshrrev_b64 v[12:13], s4, v[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec @@ -2484,10 +2557,13 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v12, v3, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(5) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 ; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v7 +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s4, v[10:11] ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 @@ -2654,7 +2730,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 diff --git a/llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir b/llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir new file mode 100644 index 0000000000000..f6c6f2d0fb77e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir @@ -0,0 +1,198 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -start-before si-post-ra-bundler -o - %s | FileCheck %s + +--- | + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" + target triple = "amdgcn-amd-amdpal" + define dllexport amdgpu_cs void @_amdgpu_cs_main(ptr inreg noundef %userdata2, ptr %out0, ptr %out1, ptr %out2, ptr %out3) { + ; CHECK-LABEL: _amdgpu_cs_main: + ; CHECK: ; %bb.0: ; %.entry + ; CHECK-NEXT: ; implicit-def: $vgpr11 + ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; CHECK-NEXT: v_cvt_u32_f32_e32 v15, v11 + ; CHECK-NEXT: ; implicit-def: $vgpr12 + ; CHECK-NEXT: v_cvt_u32_f32_e32 v28, v12 + ; CHECK-NEXT: ; implicit-def: $vgpr16 + ; CHECK-NEXT: ; implicit-def: $vgpr17 + ; CHECK-NEXT: ; implicit-def: $vgpr19 + ; CHECK-NEXT: ; implicit-def: $vgpr22 + ; CHECK-NEXT: ; implicit-def: $vgpr24 + ; CHECK-NEXT: ; implicit-def: $vgpr25 + ; CHECK-NEXT: ; implicit-def: $vgpr26 + ; CHECK-NEXT: ; implicit-def: $vgpr57 + ; CHECK-NEXT: ; implicit-def: $vgpr58 + ; CHECK-NEXT: ; implicit-def: $vgpr59 + ; CHECK-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK-NEXT: ; implicit-def: $vgpr6 + ; CHECK-NEXT: v_lshlrev_b32_e32 v56, 3, v6 + ; CHECK-NEXT: 
s_clause 0x1f + ; CHECK-NEXT: image_load v9, [v57, v58, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v10, [v57, v58, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v12, [v15, v58, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v36, [v15, v58, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v41, [v57, v28, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v42, [v57, v28, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v47, [v15, v28, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v48, [v15, v28, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v11, [v57, v58, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v51, [v57, v58, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v35, [v15, v58, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v52, [v15, v58, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v49, [v57, v28, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v53, [v57, v28, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v50, [v15, v28, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v54, [v15, v28, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v20, v[57:59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v21, [v57, v58, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v31, [v15, v58, v59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v32, [v15, v58, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v37, [v57, v28, v59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v38, [v57, v28, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v43, [v15, v28, v59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v44, [v15, v28, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v29, [v57, v58, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v30, [v57, v58, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v33, [v15, v58, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v34, [v15, v58, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v39, [v57, v28, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v40, [v57, v28, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v45, [v15, v28, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: image_load v46, [v15, v28, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm + ; CHECK-NEXT: ; implicit-def: $vgpr7 + ; CHECK-NEXT: ; implicit-def: $vgpr0 + ; CHECK-NEXT: ; implicit-def: $vgpr3 + ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v7 + ; CHECK-NEXT: s_waitcnt vmcnt(30) + ; CHECK-NEXT: v_max3_f32 v55, v9, 0, v10 + ; CHECK-NEXT: v_lshrrev_b32_e32 v9, 10, v0 + ; CHECK-NEXT: s_waitcnt vmcnt(28) + ; CHECK-NEXT: v_max3_f32 v36, v12, 0, v36 + ; CHECK-NEXT: v_lshlrev_b32_e32 v12, 1, v7 + ; CHECK-NEXT: s_waitcnt 
vmcnt(26) + ; CHECK-NEXT: v_max3_f32 v41, v41, 0, v42 + ; CHECK-NEXT: v_lshlrev_b32_e32 v10, 1, v6 + ; CHECK-NEXT: s_waitcnt vmcnt(24) + ; CHECK-NEXT: v_max3_f32 v42, v47, 0, v48 + ; CHECK-NEXT: s_waitcnt vmcnt(22) + ; CHECK-NEXT: v_max3_f32 v47, v55, v11, v51 + ; CHECK-NEXT: v_mad_u32_u24 v11, 0x90, v7, v56 + ; CHECK-NEXT: s_waitcnt vmcnt(20) + ; CHECK-NEXT: v_max3_f32 v35, v36, v35, v52 + ; CHECK-NEXT: s_waitcnt vmcnt(18) + ; CHECK-NEXT: v_max3_f32 v36, v41, v49, v53 + ; CHECK-NEXT: s_waitcnt vmcnt(16) + ; CHECK-NEXT: v_max3_f32 v41, v42, v50, v54 + ; CHECK-NEXT: s_waitcnt vmcnt(14) + ; CHECK-NEXT: v_max3_f32 v20, v47, v20, v21 + ; CHECK-NEXT: v_add_nc_u32_e32 v21, 2, v3 + ; CHECK-NEXT: s_waitcnt vmcnt(12) + ; CHECK-NEXT: v_max3_f32 v31, v35, v31, v32 + ; CHECK-NEXT: s_waitcnt vmcnt(10) + ; CHECK-NEXT: v_max3_f32 v32, v36, v37, v38 + ; CHECK-NEXT: s_waitcnt vmcnt(8) + ; CHECK-NEXT: v_max3_f32 v35, v41, v43, v44 + ; CHECK-NEXT: s_waitcnt vmcnt(6) + ; CHECK-NEXT: v_max3_f32 v29, v20, v29, v30 + ; CHECK-NEXT: v_add_nc_u32_e32 v20, 3, v3 + ; CHECK-NEXT: s_waitcnt vmcnt(4) + ; CHECK-NEXT: v_max3_f32 v30, v31, v33, v34 + ; CHECK-NEXT: s_waitcnt vmcnt(2) + ; CHECK-NEXT: v_max3_f32 v31, v32, v39, v40 + ; CHECK-NEXT: s_waitcnt vmcnt(0) + ; CHECK-NEXT: v_max3_f32 v32, v35, v45, v46 + ; CHECK-NEXT: ds_store_2addr_b32 v11, v29, v31 offset1:1 + ; CHECK-NEXT: ds_store_2addr_b32 v11, v30, v32 offset0:18 offset1:19 + ; CHECK-NEXT: s_endpgm + .entry: + ret void + } + +... +--- +name: _amdgpu_cs_main +exposesReturnsTwice: false +tracksRegLiveness: true +body: | + bb.0..entry: + + $vgpr0 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr7 = IMPLICIT_DEF + $vgpr11 = IMPLICIT_DEF + $vgpr12 = IMPLICIT_DEF + $vgpr16 = IMPLICIT_DEF + $vgpr17 = IMPLICIT_DEF + $vgpr19 = IMPLICIT_DEF + $vgpr22 = IMPLICIT_DEF + $vgpr24 = IMPLICIT_DEF + $vgpr25 = IMPLICIT_DEF + $vgpr26 = IMPLICIT_DEF + $vgpr57 = IMPLICIT_DEF + $vgpr58 = IMPLICIT_DEF + $vgpr59 = IMPLICIT_DEF + $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = IMPLICIT_DEF + + renamable $vgpr15 = nofpexcept V_CVT_U32_F32_e32 killed $vgpr11, implicit $mode, implicit $exec + renamable $vgpr28 = nofpexcept V_CVT_U32_F32_e32 killed $vgpr12, implicit $mode, implicit $exec + renamable $vgpr20 = IMAGE_LOAD_V1_V3_gfx11 $vgpr57_vgpr58_vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr21 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr29 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr30 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr11 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable 
$vgpr9 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr19, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr10 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr31 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr32 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr33 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr34 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr35 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr12 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr19, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr36 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr37 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr38 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr39 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr40 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr41 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr19, 
renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr42 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr43 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr44 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr45 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr46 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr47 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr19, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr48 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr49 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr50 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr51 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr52 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr53 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr54 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) + renamable $vgpr55 = V_MAX3_F32_e64 0, killed $vgpr9, 0, 0, 0, killed $vgpr10, 0, 0, implicit $mode, implicit $exec + renamable $vgpr9 = V_LSHRREV_B32_e32 10, $vgpr0, implicit $exec + renamable $vgpr36 = V_MAX3_F32_e64 0, killed $vgpr12, 0, 0, 0, killed $vgpr36, 0, 0, implicit $mode, implicit $exec + renamable $vgpr56 = V_LSHLREV_B32_e32 3, $vgpr6, implicit $exec + renamable $vgpr41 = V_MAX3_F32_e64 0, killed $vgpr41, 0, 0, 0, killed $vgpr42, 0, 0, implicit $mode, implicit $exec + renamable $vgpr12 = nuw nsw V_LSHLREV_B32_e32 1, $vgpr7, implicit $exec + renamable $vgpr42 = V_MAX3_F32_e64 0, killed $vgpr47, 0, 0, 0, killed $vgpr48, 0, 0, implicit $mode, implicit $exec + renamable $vgpr10 = nuw nsw V_LSHLREV_B32_e32 1, $vgpr6, implicit $exec + V_CMP_EQ_U32_e32 7, $vgpr7, implicit-def $vcc, implicit $exec + renamable $vgpr47 = V_MAX3_F32_e64 0, killed $vgpr55, 0, killed $vgpr11, 0, killed $vgpr51, 0, 0, implicit $mode, implicit $exec + renamable $vgpr35 = V_MAX3_F32_e64 0, killed $vgpr36, 0, killed $vgpr35, 0, killed $vgpr52, 0, 0, implicit $mode, implicit $exec + renamable $vgpr36 = V_MAX3_F32_e64 0, killed $vgpr41, 0, killed $vgpr49, 0, killed $vgpr53, 0, 0, implicit $mode, implicit $exec + renamable $vgpr41 = V_MAX3_F32_e64 0, killed $vgpr42, 0, killed $vgpr50, 0, killed $vgpr54, 0, 0, implicit $mode, implicit $exec + renamable $vgpr11 = V_MAD_U32_U24_e64 144, $vgpr7, killed $vgpr56, 0, implicit $exec + renamable $vgpr20 = V_MAX3_F32_e64 0, killed $vgpr47, 0, killed $vgpr20, 0, killed $vgpr21, 0, 0, implicit $mode, implicit $exec + renamable $vgpr31 = V_MAX3_F32_e64 0, killed $vgpr35, 0, killed $vgpr31, 0, killed $vgpr32, 0, 0, implicit $mode, implicit $exec + renamable $vgpr32 = V_MAX3_F32_e64 0, killed $vgpr36, 0, killed $vgpr37, 0, killed $vgpr38, 0, 0, implicit $mode, implicit $exec + renamable $vgpr35 = V_MAX3_F32_e64 0, killed $vgpr41, 0, killed $vgpr43, 0, killed $vgpr44, 0, 0, implicit $mode, implicit $exec + renamable $vgpr21 = V_ADD_U32_e32 2, $vgpr3, implicit $exec + renamable $vgpr29 = V_MAX3_F32_e64 0, killed $vgpr20, 0, killed $vgpr29, 0, killed $vgpr30, 0, 0, implicit $mode, implicit $exec + renamable $vgpr30 = V_MAX3_F32_e64 0, killed $vgpr31, 0, killed $vgpr33, 0, killed $vgpr34, 0, 0, implicit $mode, implicit $exec + renamable $vgpr31 = V_MAX3_F32_e64 0, killed $vgpr32, 0, killed $vgpr39, 0, killed $vgpr40, 0, 0, implicit $mode, implicit $exec + renamable $vgpr32 = V_MAX3_F32_e64 0, killed $vgpr35, 0, killed $vgpr45, 0, killed $vgpr46, 0, 0, implicit $mode, implicit $exec + renamable $vgpr20 = V_ADD_U32_e32 3, $vgpr3, implicit $exec + DS_WRITE2_B32_gfx9 renamable $vgpr11, killed renamable $vgpr29, killed renamable $vgpr31, 0, 1, 0, implicit $exec :: (store (s32) into %ir.out0, addrspace 3), (store (s32) into %ir.out1, addrspace 3) + DS_WRITE2_B32_gfx9 renamable $vgpr11, killed renamable $vgpr30, killed renamable $vgpr32, 18, 19, 0, implicit $exec :: (store (s32) into %ir.out2, addrspace 3), (store (s32) into %ir.out3, addrspace 3) + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 50056b62b3397..a4a4c33ccfe3e 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -9866,11 +9866,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16 ; GFX6-NEXT: s_mov_b32 s2, 0x83200 -; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(1) ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill @@ -10324,23 +10324,22 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 @@ -10358,10 +10357,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 -; 
GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39] ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 @@ -10499,6 +10498,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 8, v0 ; GFX10-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLATSCR-NEXT: s_clause 0xf +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[35:38], v5, s[38:39] offset:240 ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[31:34], v5, s[38:39] offset:224 ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[27:30], v5, s[38:39] offset:208 @@ -10514,8 +10514,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[47:50], v5, s[38:39] offset:48 ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[43:46], v5, s[38:39] offset:32 ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[39:42], v5, s[38:39] offset:16 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] -; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(15) ; GFX10-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, 16 ; GFX10-FLATSCR-NEXT: scratch_store_dword v4, v6, off ; GFX10-FLATSCR-NEXT: ;;#ASMSTART @@ -10546,6 +10545,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(4) ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v58 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v62 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v87, v57 @@ -10564,6 +10564,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v66, v36 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v65, v35 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v10 +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v72, v42 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v76, v46 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v80, v50 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll index bce7c1e5e8ab7..51fc72be41f36 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll @@ -2080,8 +2080,8 @@ define double @test_vector_reduce_fadd_v16double(double %sp, <16 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fadd_v16double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 +; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GFX9-SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] @@ -2097,7 +2097,7 @@ define double @test_vector_reduce_fadd_v16double(double %sp, <16 x double> %v) { ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[24:25] ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[26:27] ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[28:29] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(2) ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[30:31] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: 
v_add_f64 v[0:1], v[0:1], v[32:33] diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll index 657fe0f0804f3..3b8c3de3e5433 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll @@ -2080,8 +2080,8 @@ define double @test_vector_reduce_fmul_v16double(double %sp, <16 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmul_v16double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 +; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GFX9-SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] @@ -2097,7 +2097,7 @@ define double @test_vector_reduce_fmul_v16double(double %sp, <16 x double> %v) { ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[24:25] ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[26:27] ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[28:29] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(2) ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[30:31] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[32:33] diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index a42c8ac706d27..ab6df6462816a 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -384,8 +384,8 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 ; DAGISEL-NEXT: s_clause 0x3 -; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 ; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 ; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 ; DAGISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16 ; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo @@ -423,8 +423,8 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 ; GISEL-NEXT: s_clause 0x3 -; GISEL-NEXT: scratch_load_b32 v2, off, s32 ; GISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 ; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 ; GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16 ; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo @@ -463,8 +463,8 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 ; DAGISEL64-NEXT: s_clause 0x3 -; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 ; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 ; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 ; DAGISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16 ; DAGISEL64-NEXT: s_mov_b64 exec, vcc @@ -503,8 +503,8 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 ; GISEL64-NEXT: s_clause 0x3 -; GISEL64-NEXT: scratch_load_b32 v2, off, s32 ; 
GISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 ; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 ; GISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16 ; GISEL64-NEXT: s_mov_b64 exec, vcc @@ -541,8 +541,8 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 ; GFX1250-DAGISEL-NEXT: s_clause 0x3 -; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2, off, s32 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2, off, s32 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 74e9ab718c3d2..97be3c7e3b806 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -188,9 +188,9 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 @@ -1027,34 +1027,37 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v30, v36 ; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr35 killed $exec +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], 
s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-O0-NEXT: s_waitcnt vmcnt(15) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(13) ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -1135,6 +1138,7 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v33 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 6347a3783c9c6..d3f646585e4f3 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -165,9 +165,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 @@ -997,9 +997,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload -; 
GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0